diff --git a/.gitignore b/.gitignore index 7ba7b6f2e..5e9a5727a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ !/**/ !*.* !*/*/makefile +!*/*/Makefile +!*/*/LICENSE *.o *.a diff --git a/3rdParty/gslib.github/.travis.yml b/3rdParty/gslib.github/.travis.yml deleted file mode 100644 index a0417ea68..000000000 --- a/3rdParty/gslib.github/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: c - -before_install: - - export ROOT_DIR=`pwd` - - sudo apt-get update -qq - - sudo apt-get install -y mpich2 libmpich2-dev - -install: true - -script: - - make diff --git a/3rdParty/gslib.github/Makefile b/3rdParty/gslib.github/Makefile deleted file mode 100644 index bd51d44b7..000000000 --- a/3rdParty/gslib.github/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -CC=mpicc -AR=ar - -CPP=-DUSE_NAIVE_BLAS=1 -DNO_NEK_EXITT=1 -DMPI=1 -DUNDERSCORE=1 -DGLOBAL_LONG_LONG -DPREFIX=jl_ - -SOURCES = gs.c sort.c sarray_transfer.c sarray_sort.c \ -gs_local.c crystal.c comm.c tensor.c fail.c fcrystal.c \ -xxt.c sparse_cholesky.c - -OBJECTS = $(SOURCES:%.c=obj/%.o) - -libgs.a: $(OBJECTS) - $(AR) -rs $@ $(OBJECTS) - -obj/%.o: src/%.c - mkdir -p obj - $(CC) $(CPP) -O3 -c $< -o $@ - -clean: - rm -rf obj - rm libgs.a diff --git a/3rdParty/gslib.github/README b/3rdParty/gslib.github/README deleted file mode 100644 index e75e63dcc..000000000 --- a/3rdParty/gslib.github/README +++ /dev/null @@ -1,111 +0,0 @@ - -A high-level view of the code in this directory is as follows. See each header -file listed for more documentation. - -The following headers are fundamental to most of the code. 
- - name.h: a given prefix is added to all external symbols; - determines how FORTRAN routines are named - types.h: defines the integer types used everywhere (e.g., for array indices) - mem.h: memory-management wrappers; - "array" type (generic dynamically sized array); - "buffer" type ( = char array ) - comm.h: wrappers for MPI calls (with alternative single proc versions) - -The Gather/Scatter library top-level interface is defined in "gs.h". -The file "gs_defs.h" defines the datatypes and operations that it supports. - -There are two coarse solvers (XXT and AMG), which are not currently very well -documented. The interface is given in "crs.h". - -"findpts" is documented in "findpts.c". The idea is that during a run of an -SEM code, we have a geometry map - (processor, element, r, s, t) -> (x, y, z) -that defines our mesh. Within each element, the xyz coordinate is a -polynomial function of the parametric r,s,t coordinates. -"findpts" takes a distributed list of "(x,y,z)" points and computes the inverse -of the above map. -"findpts_eval" takes a list of "(proc,el,r,s,t)" coords, e.g., as returned by - "findpts", and interpolates a given field at each point. - - -The "workhorses" of the implementations of much of the above are the -"sarray_sort" and "sarray_transfer" routines, documented in the respective -headers. The "array" type, defined in "mem.h", can be used to keep track of a -dynamically sized array of (arbitrary) structs. - - sarray_sort.h: - sort an array of structs (locally/sequentially) by one or two of its fields - sarray_transfer.h: - transfer each struct in array to the processor specified by a given field - -These in turn, are implemented using the lower-level routines of -"sort.h", and "crystal_router.h". - - -The "findpts" algorithm makes use of a number of lower-level routines -possibly useful on their own. 
- - poly.h: computation of quadrature nodes; fast polynomial interpolation - lob_bnd.h: (relatively) fast yet robust bounds for polynomials on [-1,1]^d - obbox.h: oriented as well as axis-aligned bounding boxes for spectral els - tensor.h: some tensor-product applications, - with BLAS ops delegated to Nek, cblas, or a naive imp - -All of the preprocessor macros that affect compilation are: - name.h: PREFIX="..." prefix added to all C external symbols - FPREFIX="..." prefix added to all FORTRAN routines - UPCASE, UNDERSCORE determines FORTRAN naming convention - types.h: USE_LONG, USE_LONG_LONG, GLOBAL_LONG, GLOBAL_LONG_LONG - determine the integer types used by all code - mem.h: PRINT_MALLOCS=1 (print all mem mngmt to stdout) - comm.h: MPI (use MPI when defined; - otherwise, use a dummy single-proc implementation) - tensor.h: USE_CBLAS, USE_NAIVE_BLAS - (select BLAS implementation; default is Nek's mxm) - fail.c: NO_NEK_EXITT when defined, don't call Nek's exitt routine - amg.c: AMG_BLOCK_ROWS number of rows to read at a time (default=1200) - GS_TIMING record timings for the matrix multiplies - GS_BARRIER use a barrier to improve the quality of the timings - - - -Differences from JL listed below -===================== - Descriptions: -===================== - -This directory (src/jl2) includes a newly tuned gather-scatter routine updated by -Matthew Otten (at Cornell), using the similar idea of OpenACC gather-scatter kernel -previously done by Aaron Vose (Cray Inc) and Matthew Otten in this paper: - http://www.mcs.anl.gov/~mmin/hack_nekcem.pdf (pp.7, say version 15.0) - -This newer version (src/jl2), say 15.1, has the following features: - - 1. Keeps the same structure of James Lottes' original gs-routines; - 2. Uses new map arrays (for effective use of vectorization/streaming on GPU); - 3. Gives similar levels of performance on the CPU. 
- -===================== - How to use it: -===================== - -To test with our newly tuned gather-scatter kernel: there would be nothing -to change in the source code. You simply change our makefile to compile with src/jl2 -(instead of the current src/jl). - -For OpenACC GPU runs w/ GPUDirect: require "-DGPUDIRECT" option at compile time. -For OpenACC GPU runs w/o GPUDirect: no additional option required. -For CPU-only runs : no additional option required. - -===================== - Difference: -===================== -The only difference between (src/jl) vs. (src/jl2) is the following files: - src/jl2/gs.c - src/jl2/gs_local.c - src/jl2/gs_local.h - src/jl2/comm.c - - - diff --git a/3rdParty/gslib.github/examples/simple_tests/Makefile b/3rdParty/gslib.github/examples/simple_tests/Makefile deleted file mode 100644 index c20e62915..000000000 --- a/3rdParty/gslib.github/examples/simple_tests/Makefile +++ /dev/null @@ -1,42 +0,0 @@ -CC=mpicc #-std=c99 --pedantic -CFLAGS+=-DMPI -#CFLAGS+= -acc -ta=nvidia -Minfo=accel -CFLAGS+=-DPREFIX=jl_ -CFLAGS+=-DNO_NEK_EXITT -CFLAGS+=-DGLOBAL_LONG -LDFLAGS+=-lm #-acc -SRCLOC=../../src -#CFLAGS+=-DPRINT_MALLOCS=1 - -CFLAGS+=-DUSE_NAIVE_BLAS -#CFLAGS+=-DUSE_CBLAS -#LDFLAGS+=-lcblas - -#CFLAGS+=-DAMG_DUMP -CFLAGS+=-DGS_TIMING -DGS_BARRIER - -CFLAGS+=-O0 -g -#CFLAGS+=-O3 #-march=native - -#CFLAGS+=-W #-Wall -Wno-unused-function -Wno-unused-parameter -#CFLAGS+=-Minform=warn - -CCCMD=$(CC) $(G) $(CFLAGS) -LINKCMD=$(CC) $(G) $(LDFLAGS) -#RLINKCMD = $(LD) -r -all: gslib_test gs_alltoall gs_allreduce -clean: ; @$(RM) $(TESTS) *.o *.s - -cmds: ; @echo CC = $(CCCMD); echo LINK = $(LINKCMD); - -%.o: %.c ; @echo CC $<; $(CCCMD) -c $< -o $@ -%.s: %.c ; @echo CC -S $<; $(CCCMD) -S $< -o $@ -objects: $(OBJECTS) ; - -GS_OBJECTS=$(SRCLOC)/gs.o $(SRCLOC)/sort.o $(SRCLOC)/sarray_transfer.o $(SRCLOC)/sarray_sort.o \ - $(SRCLOC)/gs_local.o $(SRCLOC)/fail.o $(SRCLOC)/crystal.o $(SRCLOC)/comm.o $(SRCLOC)/tensor.o - -gslib_test: gslib_test.o 
$(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -lm -o $@ -gs_alltoall: gs_alltoall.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -gs_allreduce: gs_allreduce.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ - diff --git a/3rdParty/gslib.github/examples/simple_tests/README_ALLTOALL b/3rdParty/gslib.github/examples/simple_tests/README_ALLTOALL deleted file mode 100644 index 251946f72..000000000 --- a/3rdParty/gslib.github/examples/simple_tests/README_ALLTOALL +++ /dev/null @@ -1,14 +0,0 @@ -To make alltoall: - -make gs_alltoall - -To run: -mpiexec -n np ./gs_alltoall - -Will do 10000 samples of each alltoall method (MPI and GS). - -To make gslib_test - -make gslib_test -To run: -mpiexec -n np ./gslib_test diff --git a/3rdParty/gslib.github/examples/simple_tests/a.map b/3rdParty/gslib.github/examples/simple_tests/a.map deleted file mode 100644 index 8082c9c71..000000000 --- a/3rdParty/gslib.github/examples/simple_tests/a.map +++ /dev/null @@ -1,4 +0,0 @@ - 3 8 1 2 12 8 0 - 1 6 5 8 4 - 1 8 4 7 3 - 0 2 8 1 7 diff --git a/3rdParty/gslib.github/examples/simple_tests/b.map b/3rdParty/gslib.github/examples/simple_tests/b.map deleted file mode 100644 index 883783310..000000000 --- a/3rdParty/gslib.github/examples/simple_tests/b.map +++ /dev/null @@ -1,6 +0,0 @@ - 5 11 2 4 20 11 0 - 3 5 4 11 10 - 2 3 11 2 7 - 1 11 10 9 6 - 3 11 9 7 8 - 0 9 6 8 1 diff --git a/3rdParty/gslib.github/examples/simple_tests/c.map b/3rdParty/gslib.github/examples/simple_tests/c.map deleted file mode 100644 index a98d7129a..000000000 --- a/3rdParty/gslib.github/examples/simple_tests/c.map +++ /dev/null @@ -1,5121 +0,0 @@ - 5120 5281 12 4096 20480 5281 0 - 2549 3164 3163 3171 3162 - 2551 3212 3211 3164 3163 - 2551 3210 3212 3165 3164 - 2550 3165 3164 3172 3171 - 2411 3013 3012 3212 3211 - 2409 3021 3011 3013 3012 - 2410 3020 3021 3015 3013 - 2411 3015 3013 3210 3212 - 2547 3161 3165 3170 3172 - 2547 3209 3210 3161 3165 - 2545 3179 3161 3178 3170 - 2546 3208 3209 3179 3161 - 2415 3014 3015 3209 
3210 - 2414 3034 3014 3208 3209 - 2413 3036 3019 3034 3014 - 2415 3019 3020 3014 3015 - 2419 3033 3037 3031 3036 - 2429 3028 3026 3027 3033 - 2431 3026 3035 3033 3037 - 2428 3027 3033 3032 3031 - 2431 3045 3044 3026 3035 - 2395 2995 3006 3045 3044 - 2430 3043 3045 3028 3026 - 2395 2997 2995 3043 3045 - 2427 3042 3043 3025 3028 - 2427 3054 3042 3053 3025 - 2399 2996 2997 3042 3043 - 2398 3052 2996 3054 3042 - 2425 3030 3027 3029 3032 - 2426 3053 3025 3051 3030 - 2424 3025 3028 3030 3027 - 2423 3051 3030 3050 3029 - 2408 3018 3017 3021 3011 - 2405 3010 3008 3009 3018 - 2407 3008 43 3018 3017 - 2404 3009 3018 3020 3021 - 2401 3016 3009 3019 3020 - 2403 3035 3007 3037 3016 - 2412 3037 3016 3036 3019 - 2400 3007 3010 3016 3009 - 2407 3041 3040 3008 43 - 2406 3039 3041 3010 3008 - 2371 2980 2978 3039 3041 - 2370 2978 2977 3041 3040 - 2402 3038 3039 3007 3010 - 2375 2979 2980 3038 3039 - 2374 3006 2979 3044 3038 - 2403 3044 3038 3035 3007 - 2533 3150 3179 3157 3178 - 2535 3207 3206 3151 3150 - 2534 3151 3150 3158 3157 - 2535 3206 3208 3150 3179 - 2421 3023 3024 3205 3207 - 2420 3049 3023 3204 3205 - 2423 3050 3029 3049 3023 - 2422 3029 3032 3023 3024 - 2417 3022 3034 3206 3208 - 2418 3032 3031 3024 3022 - 2419 3031 3036 3022 3034 - 2416 3024 3022 3207 3206 - 2531 3149 3151 3156 3158 - 2529 3196 3149 3195 3156 - 2531 3205 3207 3149 3151 - 2530 3204 3205 3196 3149 - 2485 3096 3196 3103 3195 - 2487 3097 3096 3104 3103 - 2486 3203 3204 3096 3196 - 2487 3202 3203 3097 3096 - 2347 2945 3049 3203 3204 - 2345 2952 3050 2945 3049 - 2347 2947 2945 3202 3203 - 2346 2951 2952 2947 2945 - 2483 3095 3097 3102 3104 - 2482 3201 3200 3110 3095 - 2481 3110 3095 3109 3102 - 2483 3200 3202 3095 3097 - 2351 2946 2947 3200 3202 - 2350 2965 2946 3201 3200 - 2351 2950 2951 2946 2947 - 2349 2967 2950 2965 2946 - 2367 2964 2968 2963 2967 - 2355 2953 2966 2964 2968 - 2353 2955 2953 2961 2964 - 2352 2961 2964 2959 2963 - 2359 2975 2974 2954 2955 - 2327 3231 2926 3230 2975 - 2358 3230 2975 3229 2954 
- 2327 2926 2927 2975 2974 - 2363 2962 2961 2960 2959 - 2357 3228 2962 3227 2960 - 2359 2954 2955 2962 2961 - 2356 3229 2954 3228 2962 - 2355 2973 2972 2953 2966 - 2323 2927 2925 2974 2973 - 2354 2974 2973 2955 2953 - 2322 2925 2940 2973 2972 - 2344 2949 3051 2952 3050 - 2341 2944 2942 2943 2949 - 2343 2942 3053 2949 3051 - 2340 2943 2949 2951 2952 - 2339 2971 2970 2941 2944 - 2339 2972 2971 2966 2941 - 2319 2918 2919 2971 2970 - 2318 2940 2918 2972 2971 - 2337 2948 2943 2950 2951 - 2336 2941 2944 2948 2943 - 2348 2968 2948 2967 2950 - 2338 2966 2941 2968 2948 - 2343 2969 3054 2942 3053 - 2342 2970 2969 2944 2942 - 2315 2917 3052 2969 3054 - 2315 2919 2917 2970 2969 - 2469 3084 3110 3091 3109 - 2471 3085 3084 3092 3091 - 2471 3199 3198 3085 3084 - 2470 3198 3201 3084 3110 - 2361 2956 2958 3197 3199 - 2363 2960 2959 2956 2958 - 2362 3227 2960 3226 2956 - 2360 3226 2956 3225 3197 - 2467 3083 3085 3090 3092 - 2465 3224 3083 3223 3090 - 2467 3197 3199 3083 3085 - 2466 3225 3197 3224 3083 - 2365 2957 2965 3198 3201 - 2366 2959 2963 2958 2957 - 2367 2963 2967 2957 2965 - 2364 2958 2957 3199 3198 - 2369 2988 2976 2978 2977 - 2371 2989 2988 2980 2978 - 2368 2990 2985 2988 2976 - 2381 2984 2990 2989 2988 - 2377 2981 2983 2987 2984 - 2379 3913 3906 3003 2981 - 2378 3906 3907 2981 2983 - 2379 3003 2981 3004 2987 - 2375 2986 2989 2979 2980 - 2373 3004 2987 3005 2986 - 2372 3005 2986 3006 2979 - 2376 2987 2984 2986 2989 - 2383 2982 42 2990 2985 - 2383 3909 3908 2982 42 - 2380 2983 2982 2984 2990 - 2382 3907 3909 2983 2982 - 2915 3645 3670 3914 3913 - 2915 3643 3672 3645 3670 - 2919 3646 3645 3912 3914 - 2914 3652 3643 3646 3645 - 2927 3648 3654 3653 3652 - 2925 3690 3650 3689 3648 - 2926 3689 3648 3688 3653 - 2927 3650 3649 3648 3654 - 2919 3644 3646 3911 3912 - 2917 3687 3644 3910 3911 - 2916 3688 3653 3687 3644 - 2918 3653 3652 3644 3646 - 2913 3651 3673 3643 3672 - 2912 3654 3651 3652 3643 - 2923 3649 3647 3654 3651 - 2923 3647 3671 3651 3673 - 2939 3660 3659 3909 3908 - 
2938 3669 3668 3662 3660 - 2939 3662 3660 3907 3909 - 2937 3668 3658 3660 3659 - 2943 3667 3666 3665 3669 - 2935 3656 3657 3667 3666 - 2933 3673 3667 3672 3665 - 2932 3671 3656 3673 3667 - 2936 3664 3663 3668 3658 - 2929 3666 3664 3669 3668 - 2931 3655 41 3664 3663 - 2928 3657 3655 3666 3664 - 2941 3661 3662 3906 3907 - 2940 3670 3661 3913 3906 - 2943 3665 3669 3661 3662 - 2942 3672 3665 3670 3661 - 2393 3002 3005 2995 3006 - 2394 3001 3002 2997 2995 - 2392 3000 3004 3002 3005 - 2389 2994 3000 3001 3002 - 2399 2999 3001 2996 2997 - 2397 3048 2999 3052 2996 - 2396 3047 2998 3048 2999 - 2387 2998 2994 2999 3001 - 2387 2991 2993 2998 2994 - 2386 3046 2991 3047 2998 - 2385 3911 3912 2991 2993 - 2384 3910 3911 3046 2991 - 2391 2992 3003 3000 3004 - 2388 2993 2992 2994 3000 - 2391 3914 3913 2992 3003 - 2390 3912 3914 2993 2992 - 2313 2924 3048 2917 3052 - 2314 2923 2924 2919 2917 - 2312 2922 3047 2924 3048 - 2309 2916 2922 2923 2924 - 2311 2914 3046 2922 3047 - 2310 3899 3900 2915 2914 - 2308 2915 2914 2916 2922 - 2311 3900 3910 2914 3046 - 2319 2921 2923 2918 2919 - 2307 2920 2916 2921 2923 - 2317 2939 2921 2940 2918 - 2316 2938 2920 2939 2921 - 2307 2913 2915 2920 2916 - 2306 2937 2913 2938 2920 - 2305 3904 3898 2937 2913 - 2304 3898 3899 2913 2915 - 2859 3580 3603 3905 3904 - 2858 3582 3580 3903 3905 - 2857 3583 3588 3582 3580 - 2859 3588 3604 3580 3603 - 2863 3587 3586 3585 3583 - 2851 3881 3587 3880 3585 - 2851 3879 3577 3881 3587 - 2850 3577 3579 3587 3586 - 2861 3581 3582 3902 3903 - 2863 3585 3583 3581 3582 - 2862 3880 3585 3878 3581 - 2860 3878 3581 3901 3902 - 2853 3584 3602 3588 3604 - 2855 3579 3578 3586 3584 - 2856 3586 3584 3583 3588 - 2852 3578 3601 3584 3602 - 2321 2934 2939 2925 2940 - 2323 2935 2934 2927 2925 - 2333 2931 2936 2935 2934 - 2320 2936 2938 2934 2939 - 2335 2929 2937 2936 2938 - 2335 3905 3904 2929 2937 - 2334 3903 3905 2930 2929 - 2332 2930 2929 2931 2936 - 2329 2928 2930 2933 2931 - 2331 3902 3903 2928 2930 - 2330 3222 2928 3221 2933 - 
2331 3901 3902 3222 2928 - 2326 2932 2935 2926 2927 - 2325 3220 2932 3231 2926 - 2328 2933 2931 2932 2935 - 2324 3221 2933 3220 2932 - 2875 3593 3687 3900 3910 - 2875 3595 3593 3899 3900 - 2874 3600 3599 3595 3593 - 2873 3599 3688 3593 3687 - 2879 3594 3595 3898 3899 - 2877 3604 3598 3603 3594 - 2878 3598 3600 3594 3595 - 2879 3603 3594 3904 3898 - 2872 3597 3689 3599 3688 - 2871 3592 3590 3591 3597 - 2870 3591 3597 3600 3599 - 2871 3590 3690 3597 3689 - 2867 3596 3591 3598 3600 - 2876 3602 3596 3604 3598 - 2867 3589 3592 3596 3591 - 2866 3601 3589 3602 3596 - 2631 3365 3864 3304 3879 - 2631 3364 3365 3305 3304 - 2679 3344 3863 3365 3864 - 2679 3345 3344 3364 3365 - 2673 3353 3350 3343 3345 - 2685 3358 3354 3357 3353 - 2675 3357 3353 3356 3343 - 2672 3354 3352 3353 3350 - 2677 3349 3862 3344 3863 - 2676 3351 3866 3349 3862 - 2678 3350 3349 3345 3344 - 2681 3352 3351 3350 3349 - 2627 3362 3364 3303 3305 - 2675 3356 3343 3363 3362 - 2626 3363 3362 3330 3303 - 2674 3343 3345 3362 3364 - 2683 3346 3867 3351 3866 - 2682 3547 3545 3348 3346 - 2680 3348 3346 3352 3351 - 2683 3545 3868 3346 3867 - 2763 3457 3459 3548 3547 - 2762 3464 3463 3457 3459 - 2763 3480 3457 3546 3548 - 2761 3479 3464 3480 3457 - 2767 3458 3865 3545 3868 - 2765 3462 3870 3458 3865 - 2767 3459 3458 3547 3545 - 2766 3463 3462 3459 3458 - 2687 3347 3348 3354 3352 - 2684 3355 3347 3358 3354 - 2687 3548 3547 3347 3348 - 2686 3546 3548 3355 3347 - 2655 3361 3363 3320 3330 - 2655 3360 3361 3321 3320 - 2663 3332 3356 3361 3363 - 2663 3333 3332 3360 3361 - 2661 3339 3357 3332 3356 - 2662 3340 3339 3333 3332 - 2665 3342 3341 3340 3339 - 2660 3341 3358 3339 3357 - 2657 3337 3340 3331 3333 - 2659 3374 3337 3373 3331 - 2669 3372 3338 3374 3337 - 2656 3338 3342 3337 3340 - 2651 3359 3360 3319 3321 - 2658 3331 3333 3359 3360 - 2659 3373 3331 3371 3359 - 2650 3371 3359 3370 3319 - 2667 3334 3355 3341 3358 - 2667 3544 3546 3334 3355 - 2666 3543 3544 3336 3334 - 2664 3336 3334 3342 3341 - 2779 3468 3480 3544 3546 - 
2777 3470 3476 3471 3468 - 2778 3476 3479 3468 3480 - 2779 3471 3468 3543 3544 - 2671 3335 3336 3338 3342 - 2670 3541 3542 3369 3335 - 2668 3369 3335 3372 3338 - 2671 3542 3543 3335 3336 - 2783 3469 3471 3542 3543 - 2782 3532 3475 3531 3469 - 2781 3475 3470 3469 3471 - 2783 3531 3469 3541 3542 - 2764 3461 3871 3462 3870 - 2757 3456 3454 3455 3461 - 2759 3454 3872 3461 3871 - 2756 3455 3461 3463 3462 - 2759 3522 3869 3454 3872 - 2758 3521 3522 3456 3454 - 2811 3503 3873 3522 3869 - 2811 3505 3503 3521 3522 - 2755 3460 3455 3464 3463 - 2755 3453 3456 3460 3455 - 2760 3478 3460 3479 3464 - 2754 3477 3453 3478 3460 - 2753 3520 3521 3453 3456 - 2752 3519 3520 3477 3453 - 2815 3504 3505 3520 3521 - 2814 3515 3504 3519 3520 - 2809 3510 3874 3503 3873 - 2805 3502 3508 3509 3510 - 2810 3509 3510 3505 3503 - 2808 3508 3876 3510 3874 - 2815 3507 3509 3504 3505 - 2813 3514 3507 3515 3504 - 2803 3506 3502 3507 3509 - 2812 3513 3506 3514 3507 - 2803 3497 3500 3506 3502 - 2801 3512 3496 3511 3497 - 2800 3496 3501 3497 3500 - 2802 3511 3497 3513 3506 - 2807 3499 3877 3508 3876 - 2806 3501 3498 3500 3499 - 2804 3500 3499 3502 3508 - 2807 3498 3875 3499 3877 - 2799 3495 3514 3488 3515 - 2791 3493 3513 3495 3514 - 2797 3494 3493 3490 3495 - 2796 3490 3495 3489 3488 - 2791 3484 3511 3493 3513 - 2789 3483 3512 3484 3511 - 2788 3486 3483 3485 3484 - 2790 3485 3484 3494 3493 - 2795 3492 3490 3487 3489 - 2785 3530 3491 3529 3492 - 2794 3491 3494 3492 3490 - 2795 3529 3492 3528 3487 - 2787 3482 3485 3491 3494 - 2786 3527 3481 3526 3482 - 2787 3481 3486 3482 3485 - 2784 3526 3482 3530 3491 - 2773 3473 3478 3476 3479 - 2776 3474 3473 3470 3476 - 2775 3467 3466 3474 3473 - 2772 3466 3477 3473 3478 - 2780 3472 3474 3475 3470 - 2769 3525 3465 3524 3472 - 2771 3465 3467 3472 3474 - 2768 3524 3472 3532 3475 - 2771 3518 3517 3465 3467 - 2793 3487 3489 3518 3517 - 2792 3528 3487 3523 3518 - 2770 3523 3518 3525 3465 - 2774 3516 3519 3466 3477 - 2775 3517 3516 3467 3466 - 2799 3488 3515 3516 3519 - 
2798 3489 3488 3517 3516 - 2579 3302 3371 3247 3370 - 2578 3301 3302 3250 3247 - 2623 3283 3373 3302 3371 - 2622 3285 3283 3301 3302 - 2583 3300 3301 3248 3250 - 2619 3282 3285 3300 3301 - 2619 3294 3282 3299 3300 - 2583 3299 3300 3262 3248 - 2617 3290 3284 3282 3285 - 2609 3293 3289 3292 3290 - 2618 3292 3290 3294 3282 - 2616 3289 3288 3290 3284 - 2623 3287 3374 3283 3373 - 2621 3288 3286 3284 3287 - 2615 3286 3372 3287 3374 - 2620 3284 3287 3285 3283 - 2593 3263 3291 3275 3293 - 2595 3266 3263 3276 3275 - 2594 3537 3536 3263 3291 - 2595 3539 3537 3266 3263 - 2597 3265 3266 3272 3276 - 2599 3538 3540 3264 3265 - 2598 3264 3265 3273 3272 - 2599 3540 3539 3265 3266 - 2747 3432 3434 3540 3539 - 2746 3431 3432 3538 3540 - 2745 3430 3440 3431 3432 - 2747 3440 3439 3432 3434 - 2751 3433 3444 3537 3536 - 2751 3434 3433 3539 3537 - 2749 3438 3443 3433 3444 - 2750 3439 3438 3434 3433 - 2615 3280 3369 3286 3372 - 2613 3534 3535 3281 3280 - 2612 3535 3541 3280 3369 - 2614 3281 3280 3288 3286 - 2611 3279 3281 3289 3288 - 2610 3536 3533 3291 3279 - 2608 3291 3279 3293 3289 - 2611 3533 3534 3279 3281 - 2731 3416 3531 3535 3541 - 2729 3418 3424 3419 3416 - 2730 3424 3532 3416 3531 - 2731 3419 3416 3534 3535 - 2735 3417 3419 3533 3534 - 2733 3423 3418 3417 3419 - 2734 3443 3423 3444 3417 - 2735 3444 3417 3536 3533 - 2563 3297 3299 3232 3262 - 2562 3298 3297 3236 3232 - 2607 3271 3270 3298 3297 - 2607 3270 3294 3297 3299 - 2606 3278 3292 3270 3294 - 2605 3277 3278 3271 3270 - 2604 3276 3275 3277 3278 - 2592 3275 3293 3278 3292 - 2603 3274 3277 3269 3271 - 2601 3273 3272 3267 3274 - 2596 3272 3276 3274 3277 - 2600 3267 3274 3268 3269 - 2567 3295 3298 3234 3236 - 2567 3296 3295 3233 3234 - 2602 3268 3269 3296 3295 - 2603 3269 3271 3295 3298 - 2725 3421 3524 3424 3532 - 2728 3422 3421 3418 3424 - 2727 3415 3414 3422 3421 - 2724 3414 3525 3421 3524 - 2732 3420 3422 3423 3418 - 2723 3413 3415 3420 3422 - 2721 3441 3413 3442 3420 - 2720 3442 3420 3443 3423 - 2723 3452 3451 3413 3415 - 
2722 3450 3452 3441 3413 - 2695 3412 3376 3450 3452 - 2695 3376 3377 3452 3451 - 2726 3449 3523 3414 3525 - 2691 3375 3528 3449 3523 - 2691 3377 3375 3451 3449 - 2727 3451 3449 3415 3414 - 2748 3437 3442 3438 3443 - 2743 3429 3427 3428 3437 - 2742 3428 3437 3439 3438 - 2743 3427 3441 3437 3442 - 2741 3448 3450 3427 3441 - 2707 3394 3390 3447 3448 - 2740 3447 3448 3429 3427 - 2706 3390 3412 3448 3450 - 2739 3446 3447 3426 3429 - 2739 3445 3446 3425 3426 - 2711 3393 3394 3446 3447 - 2710 3392 3393 3445 3446 - 2737 3435 3428 3440 3439 - 2736 3426 3429 3435 3428 - 2738 3425 3426 3436 3435 - 2744 3436 3435 3430 3440 - 2689 3387 3529 3375 3528 - 2701 3384 3389 3388 3387 - 2688 3389 3530 3387 3529 - 2690 3388 3387 3377 3375 - 2703 3381 3526 3389 3530 - 2703 3380 3527 3381 3526 - 2702 3383 3380 3382 3381 - 2700 3382 3381 3384 3389 - 2694 3385 3388 3376 3377 - 2693 3411 3385 3412 3376 - 2699 3386 3384 3385 3388 - 2692 3410 3386 3411 3385 - 2699 3379 3382 3386 3384 - 2697 3378 3383 3379 3382 - 2696 3408 3378 3409 3379 - 2698 3409 3379 3410 3386 - 2705 3405 3411 3390 3412 - 2704 3407 3410 3405 3411 - 2707 3406 3405 3394 3390 - 2717 3401 3407 3406 3405 - 2713 3396 3399 3404 3401 - 2715 40 3395 39 3396 - 2714 3395 3400 3396 3399 - 2715 39 3396 3403 3404 - 2719 3398 3409 3407 3410 - 2718 3400 3397 3399 3398 - 2719 3397 3408 3398 3409 - 2716 3399 3398 3401 3407 - 2711 3402 3406 3393 3394 - 2709 3391 3402 3392 3393 - 2708 3403 3404 3391 3402 - 2712 3404 3401 3402 3406 - 2293 2866 3224 2873 3223 - 2295 2912 3225 2866 3224 - 2294 2867 2866 2874 2873 - 2295 2911 2912 2867 2866 - 2171 2732 3226 2912 3225 - 2170 2735 2732 2911 2912 - 2171 2740 3227 2732 3226 - 2169 2734 2740 2735 2732 - 2175 2733 2735 2909 2911 - 2173 2739 2734 2733 2735 - 2175 2744 2733 2910 2909 - 2174 2743 2739 2744 2733 - 2291 2865 2867 2872 2874 - 2290 2910 2909 2880 2865 - 2291 2909 2911 2865 2867 - 2289 2880 2865 2879 2872 - 2167 2737 3228 2740 3227 - 2168 2738 2737 2734 2740 - 2167 2730 3229 2737 3228 - 2166 
2731 2730 2738 2737 - 2165 2751 3230 2730 3229 - 2117 2692 2690 2750 2751 - 2164 2750 2751 2731 2730 - 2119 2690 3231 2751 3230 - 2172 2736 2738 2739 2734 - 2163 2742 2736 2743 2739 - 2163 2741 2729 2742 2736 - 2162 2729 2731 2736 2738 - 2161 2749 2750 2729 2731 - 2160 2748 2749 2741 2729 - 2116 2691 2692 2749 2750 - 2113 2716 2691 2748 2749 - 2277 2854 2880 2861 2879 - 2279 2908 2910 2854 2880 - 2279 2907 2908 2855 2854 - 2278 2855 2854 2862 2861 - 2275 2853 2855 2860 2862 - 2273 2896 2853 2895 2860 - 2275 2906 2907 2853 2855 - 2274 2905 2906 2896 2853 - 2155 2721 2744 2908 2910 - 2154 2728 2727 2723 2721 - 2155 2723 2721 2907 2908 - 2153 2727 2743 2721 2744 - 2159 2722 2723 2906 2907 - 2158 2760 2722 2905 2906 - 2159 2726 2728 2722 2723 - 2157 2759 2726 2760 2722 - 2152 2725 2742 2727 2743 - 2151 2720 2718 2719 2725 - 2150 2719 2725 2728 2727 - 2151 2718 2741 2725 2742 - 2149 2747 2748 2718 2741 - 2131 2703 2701 2746 2747 - 2130 2701 2716 2747 2748 - 2148 2746 2747 2720 2718 - 2147 2745 2746 2717 2720 - 2135 2758 2702 2757 2745 - 2147 2757 2745 2756 2717 - 2135 2702 2703 2745 2746 - 2145 2724 2719 2726 2728 - 2156 2755 2724 2759 2726 - 2146 2756 2717 2755 2724 - 2144 2717 2720 2724 2719 - 2119 2698 3220 2690 3231 - 2123 2700 2693 2699 2698 - 2122 2693 3221 2698 3220 - 2118 2699 2698 2692 2690 - 2115 2689 2699 2691 2692 - 2115 2697 2700 2689 2699 - 2112 2715 2689 2716 2691 - 2114 2714 2697 2715 2689 - 2121 2695 3222 2693 3221 - 2125 3896 3897 2696 2695 - 2123 2696 2695 2700 2693 - 2120 3897 3901 2695 3222 - 2127 2694 2696 2697 2700 - 2124 3895 3896 2694 2696 - 2127 2713 2694 2714 2697 - 2126 3894 3895 2713 2694 - 2129 2710 2715 2701 2716 - 2128 2712 2714 2710 2715 - 2141 2707 2712 2711 2710 - 2131 2711 2710 2703 2701 - 2143 2705 2713 2712 2714 - 2143 3893 3894 2705 2713 - 2140 2706 2705 2707 2712 - 2142 3892 3893 2706 2705 - 2137 2704 2706 2709 2707 - 2139 3890 3891 2754 2704 - 2138 2754 2704 2753 2709 - 2139 3891 3892 2704 2706 - 2134 2708 2711 2702 2703 - 2136 
2709 2707 2708 2711 - 2133 2752 2708 2758 2702 - 2132 2753 2709 2752 2708 - 2639 3307 3878 3897 3901 - 2638 3314 3880 3307 3878 - 2639 3309 3307 3896 3897 - 2637 3308 3314 3309 3307 - 2629 3312 3881 3314 3880 - 2628 3304 3879 3312 3881 - 2630 3305 3304 3313 3312 - 2636 3313 3312 3308 3314 - 2635 3306 3309 3895 3896 - 2633 3311 3308 3306 3309 - 2635 3329 3306 3894 3895 - 2634 3328 3311 3329 3306 - 2632 3310 3313 3311 3308 - 2625 3327 3310 3328 3311 - 2624 3330 3303 3327 3310 - 2627 3303 3305 3310 3313 - 2647 3316 3329 3893 3894 - 2646 3318 3316 3892 3893 - 2645 3317 3325 3318 3316 - 2647 3325 3328 3316 3329 - 2653 3326 3327 3325 3328 - 2654 3321 3320 3324 3326 - 2652 3320 3330 3326 3327 - 2644 3324 3326 3317 3325 - 2643 3315 3318 3891 3892 - 2642 3368 3322 3367 3315 - 2643 3367 3315 3890 3891 - 2641 3322 3317 3315 3318 - 2640 3323 3324 3322 3317 - 2649 3370 3319 3366 3323 - 2651 3319 3321 3323 3324 - 2648 3366 3323 3368 3322 - 2229 2803 2896 2810 2895 - 2231 2904 2903 2804 2803 - 2230 2804 2803 2811 2810 - 2231 2903 2905 2803 2896 - 2091 2654 2656 2902 2904 - 2090 2661 2660 2654 2656 - 2089 2679 2661 2677 2654 - 2091 2677 2654 2901 2902 - 2227 2802 2804 2809 2811 - 2227 2902 2904 2802 2804 - 2225 2817 2802 2816 2809 - 2226 2901 2902 2817 2802 - 2095 2655 2760 2903 2905 - 2093 2659 2759 2655 2760 - 2094 2660 2659 2656 2655 - 2095 2656 2655 2904 2903 - 2099 2675 2680 2673 2679 - 2099 2662 2678 2675 2680 - 2098 2665 2662 2676 2675 - 2107 2676 2675 2674 2673 - 2103 2671 2676 2670 2674 - 2103 2664 2665 2671 2676 - 2102 2663 2664 2672 2671 - 2111 2672 2671 38 2670 - 2097 2688 2687 2662 2678 - 2075 2639 2635 2686 2688 - 2074 2635 2649 2688 2687 - 2096 2686 2688 2665 2662 - 2101 2685 2686 2664 2665 - 2079 2638 2639 2685 2686 - 2078 2637 2638 2684 2685 - 2100 2684 2685 2663 2664 - 2213 2791 2817 2798 2816 - 2215 2900 2899 2792 2791 - 2214 2899 2901 2791 2817 - 2215 2792 2791 2799 2798 - 2105 2666 2677 2899 2901 - 2106 2673 2679 2666 2677 - 2104 2669 2666 2900 2899 - 2107 
2674 2673 2669 2666 - 2211 2790 2792 2797 2799 - 2210 2898 2897 2789 2790 - 2209 2789 2790 5281 2797 - 2211 2897 2900 2790 2792 - 2109 2667 2669 2897 2900 - 2108 2668 2667 2898 2897 - 2111 38 2670 2668 2667 - 2110 2670 2674 2667 2669 - 2092 2658 2755 2659 2759 - 2085 2653 2651 2652 2658 - 2087 2651 2756 2658 2755 - 2084 2652 2658 2660 2659 - 2083 2683 2682 2650 2653 - 2063 2624 2625 2683 2682 - 2062 2649 2624 2687 2683 - 2083 2687 2683 2678 2650 - 2081 2657 2652 2661 2660 - 2088 2680 2657 2679 2661 - 2082 2678 2650 2680 2657 - 2080 2650 2653 2657 2652 - 2087 2681 2757 2651 2756 - 2086 2682 2681 2653 2651 - 2059 2625 2623 2682 2681 - 2058 2623 2758 2681 2757 - 2057 2630 2752 2623 2758 - 2056 2629 2753 2630 2752 - 2059 2628 2630 2625 2623 - 2049 2622 2629 2628 2630 - 2063 2627 2628 2624 2625 - 2061 2647 2626 2648 2627 - 2060 2648 2627 2649 2624 - 2055 2626 2622 2627 2628 - 2051 2619 2754 2629 2753 - 2050 3883 3884 2621 2619 - 2048 2621 2619 2622 2629 - 2051 3884 3890 2619 2754 - 2055 2620 2621 2626 2622 - 2054 2646 2620 2647 2626 - 2053 3888 3882 2646 2620 - 2052 3882 3883 2620 2621 - 2073 2645 2648 2635 2649 - 2072 2644 2647 2645 2648 - 2075 2643 2645 2639 2635 - 2069 2634 2644 2643 2645 - 2071 2632 2646 2644 2647 - 2070 3887 3889 2633 2632 - 2071 3889 3888 2632 2646 - 2068 2633 2632 2634 2644 - 2065 2631 2633 2640 2634 - 2067 3885 3886 37 2631 - 2066 3886 3887 2631 2633 - 2067 37 2631 2641 2640 - 2079 2642 2643 2638 2639 - 2077 2641 2640 2636 2642 - 2076 2636 2642 2637 2638 - 2064 2640 2634 2642 2643 - 2571 3237 3259 3889 3888 - 2570 3246 3245 3240 3237 - 2569 3245 3260 3237 3259 - 2571 3240 3237 3887 3889 - 2565 3243 3235 3242 3246 - 2566 3233 3234 3244 3243 - 2575 3244 3243 36 3242 - 2564 3234 3236 3243 3235 - 2568 3241 3261 3245 3260 - 2563 3232 3262 3241 3261 - 2561 3236 3232 3235 3241 - 2560 3235 3241 3246 3245 - 2573 3239 3240 3886 3887 - 2572 3238 3239 3885 3886 - 2574 3242 3246 3239 3240 - 2575 36 3242 3238 3239 - 2587 3251 3367 3884 3890 - 2586 3258 3257 
3253 3251 - 2587 3253 3251 3883 3884 - 2585 3257 3368 3251 3367 - 2581 3256 3249 3255 3258 - 2582 3262 3248 3261 3256 - 2591 3261 3256 3260 3255 - 2580 3248 3250 3256 3249 - 2589 3252 3253 3882 3883 - 2588 3259 3252 3888 3882 - 2590 3255 3258 3252 3253 - 2591 3260 3255 3259 3252 - 2584 3254 3366 3257 3368 - 2579 3247 3370 3254 3366 - 2577 3250 3247 3249 3254 - 2576 3249 3254 3258 3257 - 2931 3681 3680 3655 41 - 2911 3632 3630 3679 3681 - 2911 3630 3629 3681 3680 - 2930 3679 3681 3657 3655 - 2935 3678 3679 3656 3657 - 2907 3642 3628 3677 3678 - 2934 3677 3678 3671 3656 - 2906 3628 3632 3678 3679 - 2905 3638 3631 3628 3632 - 2904 3637 3636 3638 3631 - 2907 3641 3638 3642 3628 - 2897 3640 3637 3641 3638 - 2910 3635 3634 3630 3629 - 2909 3636 3633 3631 3635 - 2908 3631 3635 3632 3630 - 2903 3633 35 3635 3634 - 2881 3612 3639 3618 3640 - 2883 3614 3612 3619 3618 - 2882 3858 3857 3612 3639 - 2883 3860 3858 3614 3612 - 3035 3779 3781 3861 3860 - 3034 3845 3779 3859 3861 - 3033 3844 3786 3845 3779 - 3035 3786 3785 3779 3781 - 2885 3613 3614 3621 3619 - 2887 3861 3860 3613 3614 - 2886 3686 3613 3685 3621 - 2887 3859 3861 3686 3613 - 3039 3780 3790 3858 3857 - 3039 3781 3780 3860 3858 - 3037 3784 3789 3780 3790 - 3038 3785 3784 3781 3780 - 2922 3676 3677 3647 3671 - 2895 3617 3616 3675 3676 - 2921 3675 3676 3649 3647 - 2895 3616 3642 3676 3677 - 2891 3623 3622 3615 3617 - 2884 3621 3619 3623 3622 - 2889 3685 3621 3684 3623 - 2888 3684 3623 3683 3615 - 2894 3620 3641 3616 3642 - 2893 3619 3618 3622 3620 - 2892 3622 3620 3617 3616 - 2880 3618 3640 3620 3641 - 2920 3674 3675 3650 3649 - 2891 3615 3617 3674 3675 - 2890 3683 3615 3682 3674 - 2924 3682 3674 3690 3650 - 2903 3626 3625 3633 35 - 2901 3856 3855 3626 3625 - 2900 3854 3856 3627 3626 - 2902 3627 3626 3636 3633 - 3019 3765 34 3856 3855 - 3018 3768 3765 3854 3856 - 3017 3767 3774 3768 3765 - 3019 3774 3773 3765 34 - 2899 3624 3627 3637 3636 - 2899 3853 3854 3624 3627 - 2898 3857 3853 3639 3624 - 2896 3639 3624 3640 3637 - 
3023 3766 3768 3853 3854 - 3021 3772 3767 3766 3768 - 3023 3790 3766 3857 3853 - 3022 3789 3772 3790 3766 - 2977 3735 3836 3733 3844 - 2979 3727 3725 3736 3735 - 2991 3736 3735 3734 3733 - 2976 3725 3837 3735 3836 - 2983 3732 3736 3731 3734 - 2983 3726 3727 3732 3736 - 2982 3750 3726 3752 3732 - 2987 3752 3732 3751 3731 - 2978 3759 3838 3725 3837 - 2959 3700 3698 3758 3759 - 2979 3758 3759 3727 3725 - 2959 3698 3839 3759 3838 - 2981 3757 3758 3726 3727 - 2955 3697 3700 3757 3758 - 2955 3723 3697 3756 3757 - 2980 3756 3757 3750 3726 - 2963 3717 3724 3706 3723 - 2962 3719 3722 3717 3724 - 2963 3718 3717 3708 3706 - 2973 3714 3719 3718 3717 - 2969 3709 3713 3716 3714 - 2971 33 3712 3709 3713 - 2970 3877 3709 3876 3716 - 2971 3875 33 3877 3709 - 2975 3711 3721 3719 3722 - 2972 3713 3711 3714 3719 - 2974 3712 3710 3713 3711 - 2975 3710 3720 3711 3721 - 2967 3715 3718 3707 3708 - 2965 3876 3716 3874 3715 - 2968 3716 3714 3715 3718 - 2964 3874 3715 3873 3707 - 3007 3748 3752 3747 3751 - 2999 3739 3738 3745 3748 - 2998 3745 3748 3743 3747 - 2999 3738 3750 3748 3752 - 3003 3746 3745 3744 3743 - 2993 3872 3737 3871 3746 - 2995 3737 3739 3746 3745 - 2992 3871 3746 3870 3744 - 2995 3755 3754 3737 3739 - 2994 3869 3755 3872 3737 - 2967 3707 3708 3755 3754 - 2966 3873 3707 3869 3755 - 2997 3753 3756 3738 3750 - 2961 3708 3706 3754 3753 - 2960 3706 3723 3753 3756 - 2996 3754 3753 3739 3738 - 2958 3705 3840 3698 3839 - 2951 3703 3841 3705 3840 - 2957 3699 3705 3700 3698 - 2956 3704 3703 3699 3705 - 2951 3694 3842 3703 3841 - 2950 3696 3694 3704 3703 - 2949 3693 3843 3694 3842 - 2948 3695 3693 3696 3694 - 2947 3692 3696 3701 3704 - 2947 3691 3695 3692 3696 - 2946 3720 3691 3721 3692 - 2945 3721 3692 3722 3701 - 2953 3702 3699 3697 3700 - 2952 3701 3704 3702 3699 - 2954 3724 3702 3723 3697 - 2944 3722 3701 3724 3702 - 3013 3770 3761 3774 3773 - 3016 3771 3770 3767 3774 - 3015 3764 3763 3771 3770 - 3012 3763 3762 3770 3761 - 3011 3834 3833 3760 3764 - 3051 3826 3798 3835 3834 - 3010 
3835 3834 3787 3760 - 3050 3798 3801 3834 3833 - 3020 3769 3771 3772 3767 - 3009 3787 3760 3788 3769 - 3008 3788 3769 3789 3772 - 3011 3760 3764 3769 3771 - 3014 3832 3831 3763 3762 - 3015 3833 3832 3764 3763 - 3055 3799 32 3832 3831 - 3054 3801 3799 3833 3832 - 3063 3820 3827 3809 3826 - 3069 3817 3822 3821 3820 - 3062 3822 3825 3820 3827 - 3063 3821 3820 3810 3809 - 3071 3814 3824 3822 3825 - 3068 3816 3814 3817 3822 - 3070 3815 3813 3816 3814 - 3071 3813 3823 3814 3824 - 3065 3812 3816 3819 3817 - 3067 3843 3811 3842 3812 - 3066 3811 3815 3812 3816 - 3067 3842 3812 3841 3819 - 3059 3818 3821 3808 3810 - 3064 3819 3817 3818 3821 - 3057 3840 3818 3839 3808 - 3056 3841 3819 3840 3818 - 3036 3783 3788 3784 3789 - 3031 3778 3776 3777 3783 - 3030 3777 3783 3785 3784 - 3031 3776 3787 3783 3788 - 3027 3830 3829 3775 3778 - 3027 3838 3830 3837 3775 - 3058 3839 3808 3838 3830 - 3059 3808 3810 3830 3829 - 3029 3828 3835 3776 3787 - 3028 3829 3828 3778 3776 - 3061 3809 3826 3828 3835 - 3060 3810 3809 3829 3828 - 3025 3782 3777 3786 3785 - 3024 3775 3778 3782 3777 - 3026 3837 3775 3836 3782 - 3032 3836 3782 3844 3786 - 3055 3807 3806 3799 32 - 3053 3805 3804 3800 3807 - 3052 3800 3807 3801 3799 - 3047 3804 31 3807 3806 - 3043 3792 3796 3802 3805 - 3043 3791 3797 3792 3796 - 3042 3823 3791 3824 3792 - 3041 3824 3792 3825 3802 - 3049 3803 3800 3798 3801 - 3048 3802 3805 3803 3800 - 3040 3825 3802 3827 3803 - 3051 3827 3803 3826 3798 - 3047 3795 3794 3804 31 - 3045 3797 3793 3796 3795 - 3044 3793 30 3795 3794 - 3046 3796 3795 3805 3804 - 2869 3611 3682 3590 3690 - 2868 3610 3611 3592 3590 - 2847 3567 3566 3610 3611 - 2846 3566 3683 3611 3682 - 2843 3572 3571 3565 3567 - 2839 3570 3564 3572 3571 - 2841 3576 3572 3575 3565 - 2840 3574 3570 3576 3572 - 2865 3609 3610 3589 3592 - 2864 3608 3609 3601 3589 - 2842 3575 3565 3608 3609 - 2843 3565 3567 3609 3610 - 2845 3569 3684 3566 3683 - 2847 3571 3569 3567 3566 - 2833 3564 3568 3571 3569 - 2844 3568 3685 3569 3684 - 2817 3549 3573 
3557 3574 - 2819 3850 3852 3551 3549 - 2818 3852 3851 3549 3573 - 2819 3551 3549 3558 3557 - 3005 3741 3749 3852 3851 - 3007 3747 3751 3741 3749 - 3004 3742 3741 3850 3852 - 3006 3743 3747 3742 3741 - 2821 3550 3551 3555 3558 - 2823 3868 3849 3867 3550 - 2822 3867 3550 3866 3555 - 2823 3849 3850 3550 3551 - 3001 3740 3742 3849 3850 - 3000 3865 3740 3868 3849 - 3003 3744 3743 3740 3742 - 3002 3870 3744 3865 3740 - 2854 3607 3608 3578 3601 - 2831 3554 3553 3606 3607 - 2855 3606 3607 3579 3578 - 2831 3553 3575 3607 3608 - 2830 3560 3576 3553 3575 - 2816 3557 3574 3560 3576 - 2829 3559 3560 3554 3553 - 2828 3558 3557 3559 3560 - 2849 3605 3606 3577 3579 - 2848 3864 3605 3879 3577 - 2827 3552 3554 3605 3606 - 2826 3863 3552 3864 3605 - 2827 3556 3559 3552 3554 - 2820 3555 3558 3556 3559 - 2825 3866 3555 3862 3556 - 2824 3862 3556 3863 3552 - 2835 3561 3686 3568 3685 - 2835 3846 3859 3561 3686 - 2834 3847 3846 3563 3561 - 2832 3563 3561 3564 3568 - 2985 3728 3730 3848 3847 - 2986 3731 3734 3728 3730 - 2984 3749 3728 3851 3848 - 2987 3751 3731 3749 3728 - 2839 3562 3563 3570 3564 - 2837 3851 3848 3573 3562 - 2836 3848 3847 3562 3563 - 2838 3573 3562 3574 3570 - 2989 3729 3845 3846 3859 - 2991 3734 3733 3730 3729 - 2990 3733 3844 3729 3845 - 2988 3730 3729 3847 3846 - 1013 1246 1253 1245 1244 - 1015 1294 1247 1293 1246 - 1014 1293 1246 1292 1245 - 1015 1247 1254 1246 1253 - 867 1092 1290 1091 1294 - 869 1118 1116 1090 1092 - 871 1090 1092 1100 1091 - 868 1116 1291 1092 1290 - 865 1089 1293 1088 1292 - 866 1100 1091 1103 1089 - 864 1103 1089 1102 1088 - 867 1091 1294 1089 1293 - 1011 1243 1252 1247 1254 - 1010 1291 1261 1290 1243 - 1009 1261 1260 1243 1252 - 1011 1290 1243 1294 1247 - 885 1113 1112 1119 1118 - 895 1115 1114 1113 1112 - 887 1106 1115 1105 1113 - 884 1105 1113 1117 1119 - 886 1127 1105 1126 1117 - 887 1125 1106 1127 1105 - 859 1078 1125 1076 1127 - 858 1076 1127 1087 1126 - 881 1111 1110 1115 1114 - 883 1136 1135 1104 1111 - 880 1104 1111 1106 1115 - 891 1135 
1134 1111 1110 - 882 1124 1104 1125 1106 - 863 1133 1132 1077 1124 - 883 1132 1136 1124 1104 - 863 1077 1124 1078 1125 - 879 1096 1103 1095 1102 - 877 1098 1096 1094 1095 - 878 1097 1101 1098 1096 - 879 1101 1100 1096 1103 - 876 1123 1098 1122 1094 - 875 1121 1097 1123 1098 - 843 1065 1121 1063 1123 - 842 1063 1123 1062 1122 - 875 1120 1093 1121 1097 - 873 1126 1117 1120 1093 - 847 1064 1120 1065 1121 - 846 1087 1126 1064 1120 - 871 1099 1090 1101 1100 - 874 1093 1099 1097 1101 - 870 1119 1118 1099 1090 - 872 1117 1119 1093 1099 - 997 1232 1238 1261 1260 - 999 1233 1239 1232 1238 - 999 1289 1233 1288 1232 - 998 1288 1232 1291 1261 - 995 1231 1240 1233 1239 - 995 1287 1231 1289 1233 - 994 1286 1278 1287 1231 - 993 1278 1277 1231 1240 - 889 1107 1287 1109 1289 - 891 1134 1131 1110 1107 - 890 1110 1107 1114 1109 - 888 1131 1286 1107 1287 - 893 1108 1288 1116 1291 - 892 1109 1289 1108 1288 - 894 1112 1108 1118 1116 - 895 1114 1109 1112 1108 - 815 1033 1027 1130 1133 - 814 1032 1033 1129 1130 - 805 1025 1031 1032 1033 - 815 1031 1028 1033 1027 - 807 1023 1032 1128 1129 - 807 1346 1023 1345 1128 - 806 1344 1024 1346 1023 - 804 1024 1025 1023 1032 - 801 1022 1029 1024 1025 - 803 1343 1022 1344 1024 - 803 1342 1046 1343 1022 - 802 1046 1047 1022 1029 - 811 1030 1026 1031 1028 - 809 1049 1048 1030 1026 - 800 1029 1030 1025 1031 - 808 1047 1049 1029 1030 - 487 619 1346 661 1345 - 486 621 1344 619 1346 - 487 628 619 660 661 - 485 620 621 628 619 - 495 629 628 659 660 - 495 623 629 658 659 - 484 627 620 629 628 - 494 624 627 623 629 - 483 626 625 627 620 - 491 622 626 624 627 - 489 642 643 622 626 - 488 643 644 626 625 - 481 618 1343 621 1344 - 483 625 618 620 621 - 480 645 1342 618 1343 - 482 644 645 625 618 - 823 1044 1035 1049 1048 - 831 1045 1044 1047 1049 - 821 1037 1036 1044 1035 - 820 1041 1037 1045 1044 - 817 1042 1034 1037 1036 - 819 1313 1312 1042 1034 - 816 1043 1042 1041 1037 - 825 1311 1313 1043 1042 - 831 1039 1045 1046 1047 - 829 1341 1039 1342 1046 - 828 1340 
1040 1341 1039 - 830 1040 1041 1039 1045 - 827 1038 1043 1040 1041 - 827 1339 1038 1340 1040 - 824 1310 1311 1038 1043 - 826 1338 1310 1339 1038 - 507 633 1341 645 1342 - 506 636 1340 633 1341 - 505 635 636 641 633 - 507 641 633 644 645 - 511 634 1339 636 1340 - 510 697 696 640 634 - 511 696 1338 634 1339 - 509 640 634 635 636 - 497 638 641 643 644 - 504 639 635 638 641 - 499 632 639 630 638 - 496 630 638 642 643 - 508 637 640 639 635 - 503 631 637 632 639 - 501 695 697 637 640 - 500 694 695 631 637 - 949 1178 1185 1278 1277 - 951 1179 1186 1178 1185 - 951 1285 1179 1284 1178 - 950 1284 1178 1286 1278 - 795 1009 1284 1131 1286 - 793 1011 1012 1017 1009 - 794 1017 1009 1134 1131 - 795 1012 1285 1009 1284 - 947 1177 1184 1179 1186 - 947 1283 1177 1285 1179 - 945 1192 1191 1177 1184 - 946 1282 1192 1283 1177 - 799 1010 1283 1012 1285 - 798 1021 1020 1016 1010 - 797 1016 1010 1011 1012 - 799 1020 1282 1010 1283 - 791 1014 1017 1135 1134 - 792 1015 1011 1014 1017 - 791 1007 1014 1136 1135 - 790 1008 1015 1007 1014 - 796 1013 1016 1015 1011 - 787 1019 1021 1013 1016 - 787 1018 1019 1006 1013 - 786 1006 1013 1008 1015 - 789 1056 1007 1132 1136 - 813 1027 1056 1133 1132 - 788 1055 1008 1056 1007 - 812 1028 1055 1027 1056 - 785 1054 1006 1055 1008 - 784 1053 1018 1054 1006 - 811 1026 1054 1028 1055 - 810 1048 1053 1026 1054 - 933 1166 1173 1192 1191 - 935 1281 1166 1282 1192 - 935 1280 1167 1281 1166 - 934 1167 1174 1166 1173 - 931 1165 1172 1167 1174 - 930 1309 1308 1279 1165 - 931 1279 1165 1280 1167 - 929 1308 1307 1165 1172 - 771 997 1279 996 1280 - 775 1306 1309 997 1279 - 775 1305 1306 995 997 - 774 995 997 1003 996 - 769 994 1281 1020 1282 - 770 1003 996 1005 994 - 768 1005 994 1021 1020 - 771 996 1280 994 1281 - 779 998 1005 1019 1021 - 777 1000 998 1018 1019 - 779 1004 1003 998 1005 - 778 1001 1004 1000 998 - 781 1052 999 1051 1001 - 818 1034 1052 1036 1051 - 783 1304 1303 1052 999 - 819 1312 1304 1034 1052 - 776 1050 1000 1053 1018 - 780 1051 1001 1050 1000 - 823 
1035 1050 1048 1053 - 822 1036 1051 1035 1050 - 773 1002 995 1004 1003 - 782 999 1002 1001 1004 - 783 1303 1302 999 1002 - 772 1302 1305 1002 995 - 841 1071 1063 1061 1062 - 843 1070 1065 1071 1063 - 840 1069 1071 1068 1061 - 837 1060 1070 1069 1071 - 847 1067 1064 1070 1065 - 845 1085 1086 1066 1067 - 835 1066 1067 1060 1070 - 844 1086 1087 1067 1064 - 835 1057 1066 1059 1060 - 833 1337 1057 1336 1059 - 834 1084 1085 1057 1066 - 832 1335 1084 1337 1057 - 839 1058 1069 29 1068 - 836 1059 1060 1058 1069 - 839 1334 1058 1333 29 - 838 1336 1059 1334 1058 - 395 525 1334 28 1333 - 393 527 528 534 525 - 394 528 1336 525 1334 - 395 534 525 533 28 - 399 526 1337 528 1336 - 397 532 526 527 528 - 399 550 1335 526 1337 - 398 549 550 532 526 - 396 531 532 530 527 - 387 547 548 520 531 - 386 548 549 531 532 - 387 520 531 524 530 - 389 529 534 521 533 - 392 530 527 529 534 - 391 524 530 523 529 - 388 523 529 522 521 - 857 1083 1076 1086 1087 - 859 1082 1078 1083 1076 - 856 1081 1083 1085 1086 - 849 1075 1082 1081 1083 - 862 1080 1077 1082 1078 - 861 1130 1133 1080 1077 - 860 1129 1130 1079 1080 - 855 1079 1080 1075 1082 - 851 1072 1081 1084 1085 - 848 1074 1075 1072 1081 - 850 1332 1074 1331 1072 - 851 1331 1072 1335 1084 - 855 1073 1079 1074 1075 - 854 1128 1129 1073 1079 - 853 1345 1128 1330 1073 - 852 1330 1073 1332 1074 - 407 536 1331 550 1335 - 407 546 536 549 550 - 406 544 538 546 536 - 405 538 1332 536 1331 - 404 537 1330 538 1332 - 403 535 537 544 538 - 401 661 1345 537 1330 - 400 660 661 535 537 - 415 540 546 548 549 - 413 542 540 547 548 - 415 545 544 540 546 - 414 541 545 542 540 - 403 543 535 545 544 - 402 659 660 543 535 - 411 539 543 541 545 - 411 658 659 539 543 - 3283 4178 4128 4177 4119 - 719 917 911 916 912 - 1675 2586 2105 2087 2091 - 1443 1790 1795 1797 1798 - 663 861 858 977 976 - 2555 3175 3174 3166 27 - 1429 1898 1770 1897 1769 - 1841 2256 2258 2279 2504 - 251 313 323 318 317 - 1625 2017 2025 2024 2023 - 3885 4873 4872 4845 4866 - 1231 1542 1548 1543 1547 
- 641 835 993 837 992 - 3175 5119 5124 5118 4181 - 523 755 701 758 723 - 3401 4265 4266 4270 4264 - 1717 2126 2120 2121 2519 - 1245 1553 1594 1611 1610 - 3999 5072 4963 5071 5000 - 3287 4127 4120 4122 4121 - 2041 2486 2464 2462 2463 - 4019 4983 4987 4982 4988 - 635 813 806 1295 1296 - 1283 1629 1627 1628 1626 - 1623 2016 2033 26 2013 - 3129 3973 3961 3972 3959 - 919 1150 2598 1272 2597 - 3479 4341 4350 4343 4342 - 3167 3998 3997 4496 4495 - 235 308 297 328 327 - 711 907 914 909 908 - 3613 4543 4579 4540 4578 - 2513 3184 3183 3133 3135 - 1195 1510 1509 1923 1924 - 3851 4832 4822 4833 4835 - 1183 1497 1625 1494 1624 - 1075 1391 1384 1398 1397 - 3947 4912 4908 4934 4933 - 231 294 304 296 303 - 3403 4269 4281 4283 4284 - 3121 3954 3965 3971 3964 - 3289 5122 4126 4123 4125 - 3571 4430 25 4446 4445 - 3793 4735 4777 4734 4776 - 1239 1550 1609 1608 1607 - 1503 1839 1902 1841 1901 - 1561 1969 1968 1965 1959 - 3207 5115 5117 5114 4101 - 498 652 630 651 642 - 1829 2253 2254 2249 2248 - 277 388 379 381 380 - 1129 1435 1437 1440 1439 - 3093 3929 3928 3935 3927 - 3567 4429 4428 4425 4424 - 1459 1800 1809 1801 1808 - 1767 2171 5193 2170 2165 - 3127 3982 3981 3958 3957 - 2253 2882 2885 2830 2852 - 3347 4518 4517 4516 4208 - 3911 4874 4877 4876 4875 - 493 650 623 657 658 - 492 649 624 650 623 - 467 604 649 602 650 - 7 108 48 112 75 - 2011 2432 2479 2431 2478 - 1131 1477 1476 1465 1475 - 1105 1615 1474 1618 1417 - 1421 1762 1765 1761 1764 - 51 101 92 184 185 - 3639 4558 4565 5153 5152 - 1091 1412 1460 1454 1461 - 2185 2765 2772 2771 2770 - 537 714 2606 716 2607 - 83 137 128 130 131 - 647 847 846 863 864 - 3881 4871 4870 4844 4846 - 4037 5014 5009 5015 5028 - 1043 1366 1400 1365 1399 - 1917 2345 2344 2327 2325 - 463 595 601 616 615 - 699 893 883 897 896 - 3831 4760 4768 4762 4761 - 1905 2328 2515 2318 2514 - 3773 4707 4705 4706 4711 - 3923 4885 4891 4887 4893 - 2281 2864 2859 2856 2858 - 1833 2255 2268 2266 2267 - 545 727 734 729 730 - 115 163 164 158 165 - 3443 4515 4514 4513 4301 - 
927 1194 1154 1195 1155 - 3235 5280 4076 5279 5278 - 1221 1544 1538 1540 1539 - 1837 2547 2250 2546 2252 - 1369 1704 1703 1701 1708 - 1073 1393 5186 1392 1385 - 3707 4652 4651 4650 4630 - 503 648 631 647 632 - 363 481 474 480 479 - 1859 2300 2354 2282 2281 - 3657 4594 4588 4590 4591 - 3427 4287 4293 5277 4479 - 3695 4637 4619 4638 4639 - 3269 4176 4130 4175 4112 - 1727 2197 2200 2140 2123 - 901 1138 2605 1161 2604 - 1551 1949 1954 1948 1953 - 3307 4151 4136 4150 4149 - 2263 2838 2840 5161 5162 - 3493 4489 4360 4365 4359 - 683 869 877 969 970 - 913 1149 2600 1152 2599 - 1337 1674 1673 1668 1672 - 3807 5097 4740 5100 4793 - 3515 5276 4382 5275 5274 - 751 945 944 959 960 - 3563 4422 4421 5273 4420 - 1521 1868 1866 1864 1865 - 1889 2306 2334 2305 2333 - 3263 5272 4108 5271 5270 - 4075 5038 5037 5040 5039 - 1355 1692 1739 1914 1913 - 3083 5129 5132 3921 3920 - 1663 2050 2057 24 2056 - 3541 4470 4469 4400 4399 - 755 955 953 954 948 - 2003 2445 2444 2430 2443 - 611 792 790 817 816 - 19 60 1325 62 1327 - 3375 4229 4228 4227 4225 - 3731 4716 4674 4689 4688 - 1343 1686 1670 1685 1684 - 1885 2545 2296 2294 2295 - 269 377 370 372 371 - 333 440 447 466 467 - 3111 3951 3945 3947 3946 - 205 263 362 346 361 - 129 189 197 679 680 - 1095 1408 1455 1927 1456 - 2017 2449 2447 2448 2446 - 381 509 490 508 491 - 1039 1468 1369 1472 1471 - 1903 2311 2336 2317 2335 - 1525 1882 1875 1881 1867 - 3381 4249 4231 4248 4472 - 147 214 203 222 221 - 983 1214 2617 1216 2616 - 1679 2585 2584 2583 2086 - 1739 2144 2145 2162 2189 - 2193 2818 2822 2773 2788 - 2543 3160 3176 3153 3177 - 4051 5033 5032 5018 5019 - 1915 2326 2331 2330 2329 - 659 857 856 860 859 - 3327 4148 4147 4144 4502 - 619 794 796 801 800 - 1481 1827 1834 1833 1832 - 1179 1496 1495 1491 1493 - 1493 1842 1843 1836 1837 - 1669 2083 2101 2222 2221 - 1705 2115 2112 2220 2532 - 451 598 590 614 617 - 3265 4131 4132 4109 4116 - 3277 4115 4192 4114 4155 - 183 246 243 685 686 - 2019 5213 5212 23 22 - 543 721 720 825 826 - 3975 4950 5113 4949 
4996 - 1407 1906 1729 1909 1908 - 3267 4110 4117 4157 4118 - 4091 5054 5049 5053 5052 - 3513 4381 4376 4372 4373 - 1303 1652 1651 1650 1644 - 4033 5091 5090 5007 5008 - 1695 2582 2581 2580 2108 - 3939 4902 4903 4932 4931 - 4013 5006 4977 4993 4992 - 1455 1813 1799 1822 1821 - 3465 4488 4487 4332 4355 - 943 1176 1169 1190 1189 - 3367 4260 4259 4223 4222 - 1323 1657 1658 1663 1662 - 1801 2226 2243 2347 2346 - 3989 4960 4958 4959 4964 - 3415 4314 4279 4481 4480 - 307 410 418 420 419 - 713 915 910 932 933 - 2029 5201 5268 2470 5267 - 1403 1738 1737 1727 1736 - 2287 2863 2877 2857 2878 - 2507 3187 3186 3122 3125 - 4095 5066 5065 5051 5048 - 1035 1614 1351 1616 1352 - 1341 1669 1754 1748 1755 - 555 738 731 737 733 - 3671 4608 4607 4597 4606 - 915 1160 1158 1159 1151 - 447 570 577 571 576 - 3715 5099 4717 5098 4664 - 1723 2124 2128 2134 2127 - 3803 5095 4737 5096 4739 - 3143 4179 3986 4180 3985 - 741 968 935 967 958 - 1351 1689 1694 1695 1696 - 519 709 706 725 724 - 1381 1714 2559 1732 2558 - 1089 1414 1413 1407 1406 - 311 432 413 431 412 - 1759 2164 2190 2192 2191 - 2035 2485 2484 2483 2467 - 2517 3134 3148 3141 3147 - 21 61 1329 181 1328 - 211 336 270 335 289 - 3435 4291 4317 4290 4289 - 3295 4124 4191 4129 4190 - 1299 1639 1638 1643 1637 - 3359 4323 4250 4322 4256 - 1895 2544 2543 2542 21 - 371 493 492 497 486 - 1375 1943 1942 1702 1941 - 1603 2004 2528 2003 2530 - 593 774 782 784 783 - 675 873 866 971 973 - 73 127 119 122 121 - 3549 4405 4407 4447 4450 - 3935 4895 4929 4890 4889 - 1817 2241 2240 2234 2236 - 4081 5044 5151 5150 5149 - 1715 2119 2520 2525 2524 - 3615 4536 4535 4534 4533 - 1567 1964 1962 1960 1961 - 1083 1388 1390 1387 1389 - 3993 4966 4965 4962 4961 - 2255 2884 2883 2832 2831 - 1823 2239 2238 2237 2500 - 1935 2518 2414 2517 2363 - 163 231 230 232 225 - 177 235 360 237 359 - 1799 2223 2270 2225 2224 - 2001 2499 2442 2498 2428 - 65 353 118 358 116 - 3691 4623 4622 4618 4621 - 3681 4614 4612 4623 4622 - 3683 4612 4609 4622 20 - 3693 4649 4623 4617 4618 - 
3661 4591 4646 4648 4649 - 3692 4648 4649 4647 4617 - 3680 4646 4614 4649 4623 - 3687 4645 4610 4646 4614 - 3686 5185 5184 4645 4610 - 3659 4588 4645 4591 4646 - 3659 5183 5185 4588 4645 - 3682 4613 4611 4612 4609 - 3687 4610 4613 4614 4612 - 3685 5182 5181 4613 4611 - 3684 5184 5182 4610 4613 - 3783 4731 4725 4730 4747 - 926 1154 1160 1155 1159 - 925 1164 1163 1154 1160 - 924 1163 1162 1160 1158 - 1927 2359 2357 2358 2379 - 1925 2516 2365 2515 2359 - 1924 2365 2364 2359 2357 - 1923 2515 2359 2514 2358 - 1921 2356 2381 2355 2380 - 1922 2358 2379 2356 2381 - 1920 2513 2356 2512 2355 - 1923 2514 2358 2513 2356 - 1951 2386 2378 2385 2375 - 1927 2357 2386 2379 2385 - 1926 2364 2384 2357 2386 - 1943 2384 2383 2386 2378 - 1983 2401 2399 2400 2404 - 3567 4427 4429 4423 4425 - 3559 4419 19 4429 4428 - 3553 4418 4419 4427 4429 - 3552 4444 4440 4442 4418 - 3575 4437 4431 4439 4444 - 3579 4439 4444 4443 4442 - 3575 4431 4441 4444 4440 - 3566 4426 4427 5269 4423 - 3555 4442 4418 4426 4427 - 3555 4443 4442 5268 4426 - 3565 5268 4426 5267 5269 - 3559 4417 4416 4419 19 - 3558 4440 4417 4418 4419 - 3557 4446 4445 4417 4416 - 3556 4441 4446 4440 4417 - 1977 2511 2401 2510 2400 - 87 129 177 692 693 - 119 178 159 177 157 - 86 131 178 129 177 - 723 926 918 1313 1312 - 721 921 920 926 918 - 720 928 921 927 926 - 3669 4798 4605 4797 4596 - 87 135 129 691 692 - 3151 4035 4036 4034 3987 - 351 464 456 463 18 - 1699 2139 2133 2109 2132 - 125 161 1316 696 1338 - 2179 2763 2890 5167 5166 - 1155 1532 1481 1531 1479 - 295 406 405 425 424 - 541 715 2609 827 2608 - 47 83 91 104 105 - 175 234 233 248 249 - 615 791 793 814 815 - 3191 4019 4051 4026 4050 - 197 259 266 345 344 - 3397 4512 4268 4261 4267 - 3595 4522 4529 4525 4524 - 1693 2199 2198 2095 2196 - 3507 4374 4369 4375 4458 - 1667 2104 2103 2085 2102 - 4029 4985 4995 4986 4994 - 3709 4633 4632 4631 4636 - 3889 4852 4855 5085 5084 - 2025 2473 2471 2472 2451 - 255 315 322 343 342 - 219 285 284 283 279 - 3495 4366 4379 4377 4378 - 1971 2411 
2410 2402 2409 - 95 133 139 134 138 - 2037 5237 5239 2482 5238 - 3981 4971 4953 4970 4967 - 4085 5059 5045 5058 5046 - 4043 5013 5017 5012 5010 - 3875 4848 4849 4847 4842 - 3081 5134 5133 3919 3941 - 1395 1726 1731 1725 1730 - 1365 1699 1740 1712 1741 - 111 176 151 175 150 - 1933 2361 2362 2365 2364 - 2523 3140 3139 5154 5155 - 3815 4752 4751 4750 4749 - 2455 3075 3082 3074 3081 - 2491 3106 3188 3098 3189 - 167 224 358 250 357 - 3457 4386 4330 4385 4457 - 3543 4468 4467 4402 4401 - 737 966 934 965 936 - 765 964 951 1304 1303 - 2297 2876 2871 2868 2870 - 3941 4928 4927 4905 4904 - 2207 2784 2787 2783 2786 - 749 939 991 961 990 - 2191 2824 2823 2768 2767 - 1659 2051 2055 2052 2054 - 1973 2398 2397 2418 2396 - 1615 2082 2081 2007 2080 - 103 145 1321 147 1322 - 667 905 853 974 975 - 1149 1932 1445 1931 1928 - 693 881 888 895 894 - 31 71 70 66 69 - 1747 2579 2157 2150 2156 - 3727 4673 4670 4669 5148 - 3187 4018 4020 4025 4027 - 1115 1419 1457 1459 1462 - 599 821 777 820 776 - 1517 1863 1855 1857 1856 - 1275 1582 1580 1586 1585 - 561 746 739 741 742 - 1995 2422 2427 2423 2421 - 515 705 704 708 707 - 1533 1891 1890 1889 1872 - 3587 4653 4654 4519 4576 - 3629 4657 4658 4555 4552 - 2015 2497 2441 2496 2437 - 903 1145 1144 1163 1162 - 1293 1680 1681 1631 1633 - 1003 1242 1234 1258 1259 - 1475 1823 1825 1883 1884 - 991 1266 1221 1265 1220 - 3717 4667 4683 4671 4672 - 3591 4520 4521 4527 4526 - 3459 4338 4336 4337 4328 - 1175 1503 1623 1489 1622 - 3369 4318 4226 4320 4224 - 1117 1420 1473 1421 1458 - 1165 1598 1600 1484 1483 - 1029 1612 1348 1613 1355 - 3387 4235 4234 4237 4233 - 3223 5121 5123 5120 4165 - 1431 1895 1779 1896 1774 - 1453 1794 1793 1820 1792 - 319 417 671 510 670 - 3783 4724 4723 4731 4725 - 347 462 455 461 457 - 299 409 408 404 407 - 3867 4839 4838 4869 4868 - 263 366 373 368 375 - 1405 1728 2551 2553 2552 - 1539 1950 1944 1952 1945 - 427 555 557 562 558 - 1813 2271 2273 2233 2272 - 3073 3915 3918 3974 3975 - 1897 2343 2342 2310 2312 - 3853 4821 5112 4817 5111 
- 1707 2129 2131 2114 2130 - 1575 1990 1979 1972 1978 - 159 210 217 220 219 - 3091 3926 3930 3937 3936 - 171 257 227 256 229 - 3355 4217 4216 4212 4215 - 423 552 560 554 559 - 3413 4273 4276 4272 4271 - 3299 4133 4500 4492 4499 - 369 482 662 485 672 - 3161 4001 4000 3996 3995 - 1319 1656 1655 1654 1653 - 39 87 78 106 107 - 207 265 356 264 355 - 1779 2181 2176 2183 2182 - 3239 4083 4084 4078 4077 - 1185 1512 1504 1514 1513 - 3257 4107 4106 4089 4088 - 1741 2149 5200 2148 2143 - 1171 1530 1502 1534 1533 - 1189 1601 1603 1507 1505 - 412 589 542 588 547 - 179 242 241 245 244 - 2205 2778 2785 5171 5170 - 1873 2541 2291 2540 2298 - 573 744 750 833 832 - 1389 1940 1939 1745 1717 - 4039 5077 5076 5029 5075 - 1385 1938 1937 1718 1716 - 119 177 157 693 694 - 3863 4837 4836 4825 4829 - 1869 2338 2503 2340 2516 - 3645 4561 4560 5147 5146 - 1417 1760 2567 1759 2566 - 3675 4794 4600 4663 4662 - 3945 4909 4946 4906 4945 - 4011 5005 5004 4978 4976 - 3733 4675 4812 4811 4810 - 1721 2201 2203 2125 2122 - 29 113 64 115 65 - 3197 4024 4023 4022 4021 - 3249 4090 4087 4085 4098 - 3673 4796 4603 4795 4599 - 531 718 717 712 719 - 3529 4466 4465 4464 4393 - 2439 3063 3192 3062 3193 - 589 767 773 772 771 - 3647 4564 4563 4562 17 - 1629 2067 2019 2066 2018 - 1589 2002 1982 2001 1981 - 2183 2762 2764 5164 5165 - 1057 1374 1375 1470 1469 - 977 1223 1213 1222 1212 - 1143 1464 1441 1443 1442 - 4005 4973 4975 5145 5144 - 661 851 989 972 988 - 731 922 929 924 928 - 1203 1602 1515 1529 1516 - 3199 4028 4049 4040 4048 - 1821 2235 2511 2501 2510 - 195 329 258 330 260 - 1959 2509 2508 2507 2407 - 3501 4486 4485 4484 4367 - 1267 1584 1576 1579 1578 - 1363 1706 1698 1705 1700 - 3915 4884 4880 4883 4882 - 963 1269 1200 1268 1230 - 317 421 415 519 518 - 2189 2819 2894 2766 2893 - 973 1205 2596 1227 2618 - 109 174 149 173 168 - 551 728 735 754 753 - 267 376 369 514 515 - 1333 1665 1747 1671 1753 - 2269 2881 3215 2842 3216 - 3331 4511 4194 4196 16 - 1581 1976 1975 1974 1973 - 1555 1957 1963 2076 2075 - 585 
818 766 1297 1298 - 2031 2453 2458 2452 2457 - 1957 2389 2388 2406 2393 - 1847 5251 5253 2257 5252 - 1967 2395 5194 2394 2390 - 2033 2461 2460 2459 2476 - 4055 5021 5020 5060 5061 - 3879 4841 4863 5082 5083 - 1367 1711 1707 1710 1709 - 4045 5089 5088 5011 5016 - 1387 1719 1722 1733 1721 - 3817 4759 4758 4753 4757 - 3655 4800 4587 4799 4584 - 1979 2417 2403 2416 2415 - 2459 3078 3073 3077 3076 - 161 223 354 226 353 - 3539 4398 4410 4408 4409 - 3967 4917 4920 4919 4918 - 3527 4390 4413 4452 4451 - 3959 4916 4914 4915 4944 - 2247 2826 2828 5157 5158 - 745 938 987 940 986 - 2199 2780 2776 2782 2781 - 1641 2578 2038 2058 2059 - 1639 2079 2078 2035 2070 - 97 152 144 169 170 - 1145 1449 1444 1448 1446 - 655 899 841 900 842 - 1135 1436 2561 1438 2560 - 3177 4017 4030 4011 4032 - 3085 5127 5128 4046 4045 - 3741 4679 5143 5142 5141 - 605 779 2587 781 2588 - 1235 1549 1556 1551 1552 - 1509 1852 1858 1851 1850 - 3631 4554 4551 4571 4570 - 3621 4655 4656 4583 4553 - 567 747 740 828 829 - 911 1148 1147 1143 1146 - 937 1175 1168 1170 1171 - 1945 2370 2493 2369 2492 - 1017 1257 1248 1251 1250 - 1477 1831 1845 1824 1846 - 3735 4677 4676 4681 4682 - 1169 1501 1500 1488 1490 - 1103 1426 1428 1410 1427 - 3745 4692 4690 4698 4697 - 1167 1597 1599 1487 1486 - 1153 1480 1485 1478 1499 - 1047 1373 1358 1372 1361 - 343 452 460 454 459 - 1415 1782 1758 1783 1757 - 1439 1893 1816 1894 1776 - 3791 4727 4733 4729 4732 - 3871 4828 4827 4826 4867 - 293 399 664 422 667 - 1553 1967 1966 1958 1955 - 1377 1720 1713 1715 2557 - 1559 2577 1956 2576 2074 - 3119 3980 3979 3978 3970 - 1803 2231 2230 2227 2242 - 1583 1980 2521 2073 2522 - 155 208 216 212 215 - 1591 1986 1988 1995 1994 - 173 255 228 254 247 - 3421 4327 4326 4274 4325 - 3441 4295 4316 4297 4315 - 367 477 663 476 675 - 373 484 674 483 673 - 3301 4189 4188 4498 4497 - 63 97 103 99 102 - 217 276 352 278 351 - 1749 2152 2151 2209 2208 - 1783 2174 5198 2188 2187 - 3227 4075 4070 4074 4073 - 1207 1528 1517 1522 1523 - 1735 2160 2146 2161 2163 - 
187 253 238 683 684 - 759 947 985 1306 1309 - 1807 2539 2229 2538 2228 - 3125 3977 3976 3956 3955 - 1597 1983 2531 2534 2533 - 4007 4981 4979 4980 4974 - 1867 2353 2352 2289 2351 - 1409 1815 2562 1756 2563 - 3683 4622 20 4621 4620 - 1811 2246 2245 2244 2232 - 1531 1878 1871 1877 1876 - 57 98 96 182 183 - 3743 4809 4808 4680 4807 - 1311 1646 1645 1642 1904 - 1775 2207 2173 2206 2205 - 1411 1767 2564 1766 2565 - 2497 3118 3121 5185 5184 - 3825 4767 4806 4792 4805 - 3519 5266 4371 5265 5264 - 2435 3055 3057 5177 5178 - 2501 3120 3119 5182 5181 - 1447 1811 1791 1812 1796 - 3339 4251 4201 4252 4253 - 2177 2769 2888 2761 2889 - 4089 5050 5074 5047 5140 - 3135 3967 3966 3960 15 - 390 585 523 584 522 - 3337 4207 4206 4198 4202 - 2265 2841 2848 2847 2846 - 1423 1785 1768 1784 1763 - 199 331 261 341 340 - 3583 4455 4438 4436 4437 - 3763 4710 4709 4701 4703 - 535 761 711 831 830 - 1211 1519 1536 1518 1535 - 3117 3948 3969 3953 3968 - 4054 5026 5064 5062 5063 - 1479 1826 1847 1885 1886 - 3850 4818 5110 4834 5109 - 1535 1888 1887 1874 1873 - 3313 4174 4173 4172 4145 - 1163 1482 1621 1498 1620 - 1987 5187 5225 2420 5224 - 353 470 478 501 500 - 3099 3940 3939 3931 14 - 1865 2288 2283 2284 2502 - 4059 5031 5030 5027 5022 - 3509 4380 4388 4368 4370 - 3919 4896 4943 4879 4942 - 3895 4853 5108 5080 5107 - 3931 4894 4901 4900 4899 - 79 126 120 141 140 - 3821 4783 4756 4782 4781 - 3837 4775 4774 4764 4766 - 2457 3113 3112 3070 3072 - 2445 3115 3116 3059 3061 - 3535 4397 4396 4394 4395 - 2303 2875 3213 2869 3214 - 3955 4923 4913 4922 4921 - 2249 2829 2836 2835 2834 - 1655 2053 2062 2049 2048 - 1975 2408 2489 2487 2490 - 1635 2037 2069 2034 2068 - 649 901 838 902 862 - 645 836 984 865 983 - 11 59 51 58 57 - 1247 1592 1591 1563 1593 - 3723 4668 4684 5139 5138 - 639 807 809 812 808 - 513 698 2613 700 2612 - 3597 4528 4575 4523 4577 - 527 757 702 756 703 - 1939 2377 2366 2376 2491 - 959 1188 1181 1273 1274 - 1023 1256 1249 1255 13 - 975 1211 1210 1229 1228 - 3467 4483 4482 4334 4333 - 3765 
4702 4715 4722 4721 - 3379 4230 4240 4241 4242 - 1069 1405 1404 1403 1383 - 3389 4243 4236 4473 4474 - 3785 4728 4726 5094 4746 - 1463 1802 1807 1892 1819 - 355 507 472 506 499 - 3667 4601 4595 4602 4643 - 459 594 600 596 599 - 287 385 392 397 396 - 1571 1970 1977 2071 2072 - 1855 2261 2265 2264 2263 - 1611 2006 2011 2026 2010 - 1703 2111 2116 2118 2117 - 149 213 204 206 205 - 3101 3938 3934 3932 3933 - 365 475 677 498 676 - 3141 3984 4003 3983 4002 - 3303 4162 4161 4135 4134 - 221 277 350 288 349 - 1327 1659 1675 1661 1676 - 1757 2155 2154 2159 2158 - 189 252 239 251 240 - 1215 1524 1521 1922 1520 - 3231 4069 4064 4066 4065 - 443 569 575 568 567 - 657 850 982 852 981 - 571 743 749 745 748 - 1123 1434 1433 1451 1431 - 465 610 602 656 655 - 1607 2009 2523 2005 2529 - 3933 4930 4941 4888 4940 - 1255 1572 1571 1566 1565 - 3226 4100 4163 4099 4168 - 55 93 95 100 94 - 203 269 262 268 267 - 1499 1921 1920 1838 1840 - 3547 4406 12 4449 4448 - 725 925 919 921 920 - 735 923 927 1310 1311 - 3330 4205 4193 4204 4203 - 1061 1380 2549 1379 2550 - 3643 4574 4559 4567 4566 - 379 505 489 504 488 - 309 430 411 516 517 - 1310 1750 1752 1903 1911 - 3157 4038 4037 4007 3993 - 2243 2827 2851 5159 5160 - 3155 4033 4047 3994 3992 - 1109 1418 1422 1423 1424 - 3545 4412 4411 4404 4403 - 2023 2456 2455 2454 11 - 907 1199 1140 1198 1164 - 767 957 956 1302 1305 - 3823 4755 4771 4773 4772 - 3663 4589 4648 4604 4647 - 1193 1508 1527 1926 1925 - 1037 1357 1356 1353 1368 - 1953 2506 5190 2505 2387 - 181 236 348 687 688 - 3771 5101 4704 5103 5102 - 3751 4789 4787 4788 4691 - 3075 3925 3923 3924 3917 - 1399 1744 1735 1724 1734 - 2493 3105 3101 3099 3100 - 3523 4463 4462 4461 4391 - 71 117 123 118 125 - 2203 2777 2779 5168 5169 - 2217 2801 2796 2793 2795 - 2271 2887 2886 2844 2843 - 23 68 67 179 180 - 671 903 854 904 855 - 127 167 166 695 697 - 587 824 769 823 768 - 3183 4016 4015 4182 4013 - 621 797 795 799 798 - 1989 2425 2419 2426 2477 - 909 1193 1141 1197 1142 - 3323 4171 4146 4170 4143 - 971 
1204 2594 1206 2595 - 1487 1919 1918 1830 1829 - 1763 2166 2167 2204 2527 - 1042 1360 1402 1359 1401 - 1099 1409 1425 1415 1411 - 1055 1371 1367 1370 1364 - 3219 4072 4062 4071 4061 - 337 450 458 449 448 - 291 398 666 400 665 - 409 582 539 587 541 - 1883 2299 2304 2293 2303 - 1537 2212 2214 1951 2213 - 135 191 199 193 198 - 1701 2138 2137 2219 2110 - 3651 4585 4593 4586 4592 - 3159 4006 3999 4005 4004 - 3689 4616 4615 4641 4640 - 3131 3963 3962 4494 4493 - 3247 4094 4082 4093 4081 - 3205 5116 4054 4052 4053 - 1785 2179 2186 2177 2185 - 3637 4582 4557 4581 5137 - 753 946 980 949 979 - 435 574 563 573 10 - 4071 5057 5056 5034 5055 - 1573 2000 1991 1999 1971 - 3625 4549 4548 4569 4568 - 1527 1880 1870 1879 1869 - 1899 2316 2313 2315 2314 - 687 870 876 871 875 - 2447 3066 3060 3065 3064 - 577 770 762 1299 1300 - 3487 4347 4349 4354 4353 - 631 803 811 805 810 - 385 583 520 586 524 - 1631 2064 2063 2065 2020 - 3309 4152 4139 4153 4491 - 3481 4348 4344 4346 4345 - 3407 4510 4509 4508 4324 - 3757 4694 4696 4720 4719 - 1469 1804 1814 1806 1805 - 3211 5131 5130 4060 4055 - 2463 3111 3114 3071 3080 - 1515 1862 1854 1861 9 - 3411 4313 4285 4312 4310 - 225 291 302 325 326 - 243 321 309 320 311 - 4061 5025 5073 5024 5023 - 1353 1697 1690 1912 1915 - 3913 4881 4939 4878 5078 - 3927 4886 4892 4898 4897 - 2537 3159 3155 3152 3154 - 3429 4306 4305 5263 4286 - 3951 4925 4938 4907 4937 - 2241 2833 2849 2825 2850 - 1941 2413 2412 2368 2367 - 123 160 1314 162 1315 - 5 46 56 72 73 - 1127 1463 1432 1450 1452 - 3181 4184 4012 4183 4039 - 559 736 732 752 751 - 1295 1636 1632 1635 1634 - 3633 4573 4572 4580 4556 - 1831 2537 2536 2535 2278 - 1049 1363 1362 1467 1466 - 1181 1492 1619 1930 1929 - 4025 4991 4990 5136 5135 - 331 438 446 442 445 - 477 606 613 608 607 - 431 561 556 580 581 - 3679 4598 4642 4661 4644 - 1599 1985 1984 1993 1992 - 3335 4507 4506 4195 4197 - 143 202 195 201 196 - 3107 4041 3952 4042 3944 - 1683 2099 2097 2098 2092 - 3271 4169 4111 4156 4158 - 1191 1506 1525 1511 1526 - 
4003 5106 5105 4972 5104 - 437 572 564 566 565 - 3475 4352 4351 4340 4339 - 3601 4542 4541 4530 4537 - 2047 2481 2469 2480 2466 - 3943 4924 4926 4910 4911 - 113 172 156 178 159 - 3835 4770 4765 4769 4763 - 1107 1617 1416 1430 1429 - 2007 2438 2429 2440 2439 - 3797 4741 4736 4743 4742 - 1329 1936 1935 1664 1666 - 3449 4299 4303 4298 4302 - 139 200 194 681 682 - 2259 2839 3218 5163 5172 - 1485 1917 1916 1828 1848 - 3873 4840 4843 5079 5081 - 2519 3180 3185 3136 3146 - 239 307 300 306 301 - 3979 4968 4951 4969 4952 - 215 333 273 334 274 - 1909 2321 2324 2320 2319 - 1835 5226 5228 2251 5227 - 2473 3094 3089 3086 3088 - 4023 5003 5002 5001 4989 - 2235 2813 2891 2805 2892 - 1643 2575 2041 2574 2040 - 3455 4505 4504 5262 4300 - 529 759 710 760 713 - 3173 5126 5125 4009 4010 - 967 1207 1208 1203 1209 - 3767 4714 4713 4708 4712 - 1291 1683 1682 1630 1647 - 1467 1810 1803 1818 1817 - 1425 1780 1781 1773 1789 - 1259 1587 1567 1588 1575 - 1585 2210 2215 1987 1989 - 3115 4044 3950 4043 3949 - 1851 2269 2259 2277 2262 - 43 82 90 84 89 - 3447 4296 4309 4308 4307 - 3697 4635 4628 4634 4627 - 3319 4142 4154 4141 4140 - 475 605 612 654 653 - 3171 4008 4029 4014 4031 - 2021 5250 5249 2450 5191 - 1891 2309 2307 2308 8 - 579 764 2593 1301 2601 - 1335 1934 1933 1667 1746 - 985 1217 1226 1225 1224 - 1777 2195 5195 2194 2175 - 1543 1946 1996 1998 1997 - 1755 2573 2572 2571 2153 - 3521 4389 4415 4392 4414 - 2045 2468 2475 2465 2474 - 275 389 378 395 394 - 1359 1691 1743 1693 1742 - 3819 4780 4779 4778 4754 - 2527 3144 3143 3138 3142 - 3579 4434 4439 5261 4443 - 2223 2800 2814 2794 2815 - 2237 2812 2808 2806 2807 - 739 943 942 937 941 - 1227 1541 1546 1559 1560 - 1507 1849 1860 1853 1859 - 953 1187 1180 1182 1183 - 1005 1241 1235 1237 1236 - 3383 4247 4246 4245 4232 - 1249 1589 1573 1590 1564 - 1547 2570 2211 2569 1947 - 1617 2032 2031 2014 2030 - 3843 4814 4819 4815 4820 - 4065 5260 5035 5259 5258 - 1 111 45 110 47 - 3145 3988 3989 3991 3990 - 2437 3056 3194 5180 5179 - 1877 2350 2349 2292 
2348 - 3859 4824 4830 4823 4831 - 3574 4433 4432 4431 4441 - 625 804 802 1308 1307 - 3357 4503 4214 4321 4213 - 1929 2360 2382 2384 2383 - 3739 4687 4678 4686 4685 - 1985 2495 5188 2494 2424 - 2521 3137 3145 5156 5183 - 3887 4851 4865 4850 4864 - 4067 5043 5041 5042 5036 - 763 963 950 962 952 - 2451 3067 3069 5173 5174 - 3845 4936 4816 4935 4813 - 3627 4660 4659 4550 7 - 603 785 778 789 788 - 965 1264 1201 1267 1202 - 301 429 402 428 423 - 259 364 374 512 513 - 1027 1350 1349 1354 1347 - 3213 4059 4056 4058 4057 - 3349 4211 4220 4209 4219 - 3272 4187 4186 4113 4185 - 3221 4063 4164 4067 4068 - 2503 3131 3128 3129 6 - 3987 5087 5086 5068 4957 - 1665 2090 2088 2089 2084 - 921 1196 1153 1276 1275 - 1691 2107 2096 2106 2100 - 3503 4364 4363 4362 4361 - 3253 4105 4104 4091 4086 - 1863 2280 2285 2287 2286 - 3463 4331 4459 4456 4460 - 1277 1596 1583 1595 1581 - 1647 2039 2077 2044 2043 - 315 414 668 416 669 - 1071 1396 1395 1378 1394 - 1491 1844 1900 1835 1899 - 1733 2568 2142 2141 2147 - 33 85 77 88 79 - 3395 4263 4262 4311 4282 - 3805 4738 4744 4791 4790 - 3983 4956 4999 4997 4998 - 3419 4280 4471 4478 4477 - 3391 4239 4238 4476 4475 - 3623 4547 4546 4545 4544 - 13 52 1317 74 1326 - 105 148 155 154 153 - 2197 2821 2820 2774 2775 - 3581 4454 4453 5257 4435 - 899 1137 2602 1139 2603 - 283 383 391 387 390 - 357 503 471 502 473 - 1271 1577 1606 1605 1604 - 3703 4626 4625 4629 4624 - 3431 4304 4292 4288 4294 - 1853 5215 5214 2260 5189 - 583 763 2591 765 2592 - 1907 2323 2513 2322 2512 - 2557 3173 3169 3167 3168 - 1147 1453 2554 1447 2555 - 3755 4695 4804 4718 4803 - 3255 4103 4102 4097 4096 - 1687 2094 2135 2093 2136 - 453 597 591 593 592 - 1393 1723 1907 1905 1910 - 1773 2184 2169 2168 2172 - 419 553 551 578 579 - 691 880 891 882 890 - 643 845 843 849 848 - 297 427 401 426 403 - 471 609 603 611 604 - 3245 4080 4167 4079 4166 - 491 646 622 649 624 - 2479 3093 3108 3087 3107 - 707 906 913 930 931 - 1305 1649 1640 1648 1641 - 1067 1377 1376 1382 1381 - 923 1157 1156 1270 1271 
- 517 699 2611 722 2610 - 3311 4160 4159 4137 4138 - 1243 1558 1555 1557 1554 - 2499 3127 3132 3126 3130 - 321 444 433 436 435 - 2257 2845 3217 2837 3219 - 1261 1570 1568 1569 1574 - 1825 2276 2247 2275 2274 - 1079 1386 5202 2548 2556 - 15 54 1319 53 1318 - 3971 4955 4948 4954 4947 - 3259 5256 4092 5255 5254 - 679 872 867 874 868 - 701 892 885 887 886 - 3753 4700 4802 4699 4801 - 3891 4861 4862 4860 4854 - 327 436 435 443 434 - 758 956 947 1305 1306 - 759 949 979 947 985 - 757 948 949 956 947 - 1060 1379 2550 2573 2572 - 2043 2464 2473 2463 2472 - 2044 2465 2474 2473 2471 - 2043 2466 2465 2464 2473 - 3448 4513 4301 4299 4303 - 2027 2475 2453 2474 2452 - 2031 2470 5267 2453 2458 - 2028 2476 2470 2475 2453 - 3453 4504 4299 4300 4298 - 2046 2469 2468 2466 2465 - 2032 2459 2476 2468 2475 - 2034 2467 2459 2469 2468 - 3105 3944 3947 4517 4501 - 3418 4279 4280 4480 4478 - 3419 4278 4275 4280 4471 - 3414 4271 4278 4279 4280 - 3452 4502 4513 4504 4299 - 241 328 327 321 309 - 250 323 321 317 320 - 240 326 328 323 321 - 1063 1375 2573 1469 2571 - 255 314 315 339 343 - 253 318 317 315 322 - 254 316 318 314 315 - 674 877 873 970 971 - 249 312 313 316 318 - 251 325 326 313 323 - 248 324 325 312 313 - 3345 4517 4501 4208 4210 - 3491 4485 4489 4367 4365 - 3491 4481 4480 4489 4360 - 3434 4289 4481 4485 4489 - 675 874 868 873 866 - 235 297 336 327 335 - 211 334 274 336 270 - 234 301 334 297 336 - 2509 3182 3181 3124 3123 - 214 273 285 274 283 - 213 271 280 285 284 - 212 272 271 273 285 - 681 875 874 877 873 - 239 300 333 301 334 - 215 332 272 333 273 - 238 299 332 300 333 - 3104 3947 3946 4501 4490 - 3435 4317 4314 4289 4481 - 3415 4272 4271 4314 4279 - 3410 4310 4272 4317 4314 - 2005 2439 2480 2479 2486 - 227 296 303 291 302 - 226 290 291 324 325 - 227 295 296 290 291 - 2040 2479 2486 2478 2462 - 230 304 307 303 306 - 237 298 299 307 300 - 236 305 298 304 307 - 2042 2480 2466 2486 2464 - 229 292 294 295 296 - 231 293 305 294 304 - 228 5 293 292 294 - 666 853 861 975 977 - 186 238 
246 684 685 - 185 245 244 246 243 - 184 240 245 238 246 - 4021 5071 5000 5070 4984 - 1913 2341 2345 2337 2327 - 1871 2287 2286 2345 2344 - 1861 2303 2287 2341 2345 - 1126 1432 1434 1452 1451 - 1870 2285 2288 2286 2284 - 1867 2289 2351 2288 2283 - 1858 2281 2289 2285 2288 - 1125 1440 1439 1434 1433 - 1860 2304 2280 2303 2287 - 1863 2282 2281 2280 2285 - 1862 2302 2282 2304 2280 - 2013 2493 2434 2492 2433 - 1524 1875 1878 1867 1877 - 1530 1874 1873 1878 1871 - 1534 1872 1874 1875 1878 - 3806 4739 4743 4740 4738 - 1956 2406 2393 2411 2410 - 1983 2399 2411 2404 2402 - 1982 2407 2406 2399 2411 - 743 937 941 945 944 - 1955 2388 2395 2393 2394 - 1966 5275 5274 2395 5194 - 1955 2387 5275 2388 2395 - 1124 1462 1440 1432 1434 - 1959 2508 2389 2407 2406 - 1954 2505 2387 2389 2388 - 1958 2504 2505 2508 2389 - 2015 2496 2437 2434 2436 - 1515 1854 1880 9 1879 - 1527 1881 1867 1880 1870 - 1514 1856 1881 1854 1880 - 742 935 945 958 959 - 3511 4379 4380 4378 4368 - 3494 4358 4387 4380 4388 - 3495 4359 4358 4379 4380 - 3804 4743 4742 4738 4744 - 3499 4363 4366 4361 4377 - 3492 4365 4359 4366 4379 - 3500 4367 4365 4363 4366 - 743 936 937 935 945 - 3515 4382 4381 5274 4372 - 3512 4377 4378 4381 4376 - 3499 4361 4377 4382 4381 - 2012 2434 2436 2433 2435 - 247 310 331 338 341 - 194 330 260 331 261 - 247 311 330 310 331 - 2026 2452 2457 2456 2455 - 1846 2257 5252 2506 5190 - 1845 2258 2506 2504 2505 - 1844 2263 2257 2258 2506 - 2027 2474 2452 2471 2456 - 3497 5253 4364 5252 4362 - 3503 4484 4367 4364 4363 - 3502 4479 4484 5253 4364 - 2024 2471 2456 2451 2454 - 1847 2265 5251 2263 2257 - 3425 5277 4479 5251 5253 - 1854 5189 5277 2265 5251 - 326 445 436 447 443 - 755 953 946 948 949 - 752 961 990 946 980 - 754 960 961 953 946 - 418 677 553 676 578 - 183 243 236 686 687 - 180 237 359 236 348 - 182 244 237 243 236 - 3807 4740 4738 4793 4791 - 93 143 142 133 139 - 94 357 143 360 133 - 95 360 133 359 134 - 419 554 559 553 551 - 179 241 235 244 237 - 176 250 357 235 360 - 178 249 250 241 235 - 
2014 2491 2496 2493 2434 - 2020 2458 5250 2457 2450 - 3564 5269 4423 5250 5249 - 2030 5267 5269 2458 5250 - 378 491 497 489 496 - 3997 4962 4961 5072 4963 - 4063 5073 5072 5023 5071 - 3996 5075 4962 5073 5072 - 430 556 589 581 588 - 4087 5064 5066 5063 5051 - 4062 5024 5023 5066 5065 - 4058 5022 5024 5064 5066 - 421 675 554 677 553 - 4057 5030 5025 5022 5024 - 4060 5029 5075 5025 5073 - 4056 5028 5029 5030 5025 - 1719 2206 2205 2126 2120 - 1519 1855 1882 1856 1881 - 1532 1889 1872 1882 1875 - 1519 1886 1889 1855 1882 - 408 587 541 589 542 - 4079 5248 5043 5247 5042 - 4077 5040 5039 5043 5041 - 4079 5246 5040 5248 5043 - 1401 1731 1738 1730 1727 - 4075 5037 5059 5039 5058 - 4086 5062 5063 5059 5045 - 4074 5061 5062 5037 5059 - 429 558 587 556 589 - 4078 5245 5038 5246 5040 - 4073 5060 5061 5038 5037 - 4072 5244 5060 5245 5038 - 377 496 495 487 494 - 2010 2436 2432 2435 2431 - 2011 2440 2439 2432 2479 - 2009 2437 2440 2436 2432 - 1209 1534 1533 1519 1536 - 4050 5018 5019 5021 5020 - 4049 5243 5021 5244 5060 - 4048 5242 5018 5243 5021 - 1208 1523 1534 1521 1519 - 4053 5032 5031 5019 5027 - 4036 5015 5028 5031 5030 - 4042 5010 5015 5032 5031 - 1213 1521 1519 1520 1518 - 4051 5241 5033 5242 5018 - 4041 5012 5010 5033 5032 - 4040 5240 5012 5241 5033 - 223 283 279 282 281 - 433 566 565 574 563 - 432 576 566 575 574 - 442 575 574 567 573 - 1201 1515 1532 1516 1531 - 2035 2484 2461 2467 2459 - 2039 2482 5238 2461 2460 - 2036 2477 2482 2484 2461 - 1159 1605 1604 1532 1481 - 3577 5239 4434 5238 5261 - 3578 4436 4437 4434 4439 - 3576 4435 4436 5239 4434 - 2262 2840 2839 5162 5163 - 1991 2419 5237 2477 2482 - 3580 5257 4435 5237 5239 - 1991 5224 5257 2419 5237 - 1622 2022 2015 2021 2012 - 2008 2441 2438 2437 2440 - 2007 2430 2443 2438 2429 - 2002 2428 2430 2441 2438 - 1400 1734 1733 1731 1738 - 3978 4969 4952 4971 4953 - 3995 4965 4971 4961 4970 - 3994 4964 4969 4965 4971 - 1202 1610 1605 1515 1532 - 3979 4951 4955 4952 4954 - 3969 5085 5084 4955 4948 - 3977 5083 5085 4951 4955 
- 3116 3950 3948 3949 3953 - 3991 4958 4968 4964 4969 - 3976 5082 5083 4968 4951 - 3991 5081 5082 4958 4968 - 1722 2122 2206 2128 2126 - 3973 4949 4996 5005 5004 - 3970 4947 4949 4999 5005 - 4015 4999 5005 4998 4978 - 3118 3975 3978 3950 3948 - 3894 4855 4853 5084 5080 - 3895 5103 5102 4853 5108 - 3893 4854 5103 4855 4853 - 1563 1962 2000 1961 1999 - 3769 4704 4707 5102 4706 - 3775 4791 4790 4707 4705 - 3770 4793 4791 4704 4707 - 3119 3978 3970 3948 3969 - 3892 4862 5101 4854 5103 - 3771 5100 4793 5101 4704 - 3899 4857 5100 4862 5101 - 1716 2128 2126 2127 2121 - 2004 2429 2481 2439 2480 - 2047 2483 2467 2481 2469 - 2006 2443 2483 2429 2481 - 1993 2421 2426 2444 2485 - 3878 4850 4864 4841 4863 - 3879 4843 4841 5081 5082 - 3877 4842 4850 4843 4841 - 1992 2444 2485 2443 2483 - 3899 4858 4857 4861 4862 - 3897 4866 4858 4865 4861 - 3898 4865 4861 4864 4860 - 1988 2426 2477 2485 2484 - 3876 4849 4851 4842 4850 - 3887 4845 4866 4851 4865 - 3886 4846 4845 4849 4851 - 209 282 281 287 286 - 391 586 524 585 523 - 434 565 586 563 585 - 435 563 585 10 584 - 3209 5151 5131 5149 4060 - 3087 4051 3925 4050 3924 - 3086 3921 3920 3925 3923 - 3087 4045 3921 4051 3925 - 3772 4706 4711 4710 4709 - 3083 5132 5134 3920 3919 - 3636 4581 5137 5134 5133 - 3609 4533 4581 5132 5134 - 4088 5047 5140 5131 5130 - 3084 5128 5129 4045 3921 - 3611 4534 4533 5129 5132 - 3611 4539 4534 5128 5129 - 1628 2019 2022 2018 2021 - 3739 4678 4809 4685 4680 - 3607 4811 4810 4809 4808 - 3738 4682 4811 4678 4809 - 4090 5052 5047 5151 5131 - 3907 4883 4882 5091 5090 - 4035 5088 5091 5016 5007 - 3907 4875 4883 5088 5091 - 1383 1733 1721 1738 1737 - 4047 5236 5013 5240 5012 - 4046 5011 5016 5013 5017 - 4047 5235 5011 5236 5013 - 308 411 421 517 519 - 4044 5234 5089 5235 5011 - 3906 4876 4875 5089 5088 - 3909 5233 4876 5234 5089 - 1619 2024 2023 2022 2015 - 3732 4718 4803 4675 4812 - 3734 4719 4718 4676 4675 - 3735 4676 4675 4682 4811 - 316 420 419 421 415 - 3915 4880 4881 4882 4878 - 3918 4879 4942 4881 4939 - 
3917 4899 4879 4880 4881 - 2259 2837 3219 2839 3218 - 3883 4943 4848 4942 4847 - 3882 4844 4846 4848 4849 - 3883 4940 4844 4943 4848 - 310 412 420 411 421 - 3919 4901 4896 4899 4879 - 3932 4888 4940 4896 4943 - 3934 4889 4888 4901 4896 - 1630 2020 2024 2019 2022 - 3603 4812 4542 4810 4530 - 3599 4525 4524 4542 4541 - 3593 4803 4525 4812 4542 - 2182 2764 2763 5165 5167 - 3908 5232 4874 5233 4876 - 3910 5231 4898 5232 4874 - 3911 4898 4897 4874 4877 - 2179 2761 2889 2763 2890 - 3926 4892 4894 4897 4900 - 3931 4890 4889 4894 4901 - 3930 4893 4890 4892 4894 - 2178 2770 2761 2764 2763 - 3925 5230 4886 5231 4898 - 3927 4887 4893 4886 4892 - 3924 5229 4887 5230 4886 - 1427 1772 1788 1771 1787 - 379 508 491 505 489 - 350 457 508 456 505 - 351 456 505 18 504 - 1588 2001 1981 2000 1991 - 1904 2331 2328 2329 2318 - 1919 2340 2516 2328 2515 - 1919 2325 2340 2331 2328 - 3971 4954 4947 4956 4999 - 1915 2336 2326 2335 2330 - 1916 2327 2325 2326 2331 - 1914 2337 2327 2336 2326 - 3983 4953 4956 4967 4997 - 1911 2334 2321 2333 2320 - 1910 2330 2329 2321 2324 - 1911 2335 2330 2334 2321 - 2443 3117 3190 3058 3191 - 3974 5113 5099 4996 5098 - 3761 4701 4703 5099 4717 - 3762 5107 4701 5113 5099 - 3768 5102 4706 5108 4710 - 1398 1724 1734 1726 1731 - 1397 1743 1726 1742 1725 - 1396 1741 1724 1743 1726 - 3980 4952 4954 4953 4956 - 1391 1735 1719 1734 1733 - 1387 1718 1716 1719 1722 - 1391 1717 1718 1735 1719 - 1471 1809 1804 1808 1806 - 1399 1740 1744 1741 1724 - 1390 1745 1717 1744 1735 - 1361 1700 1745 1740 1744 - 2491 3098 3189 3117 3190 - 3861 4825 4829 4824 4830 - 3857 4937 4825 4946 4824 - 3856 4946 4824 4945 4823 - 1471 1915 1914 1809 1804 - 1345 1688 1689 1687 1695 - 1351 1710 1709 1689 1694 - 1350 1755 1710 1688 1689 - 3763 5108 4710 5107 4701 - 1367 1707 1699 1709 1712 - 1364 1705 1700 1699 1740 - 1366 1708 1705 1707 1699 - 1470 1914 1913 1804 1814 - 1349 1754 1711 1755 1710 - 1371 1701 1708 1711 1707 - 1371 1753 1701 1754 1711 - 2490 3100 3098 3116 3117 - 3935 4929 4930 4889 
4888 - 3944 4906 4945 4930 4941 - 3946 4933 4906 4929 4930 - 3916 4900 4899 4884 4880 - 1459 1912 1915 1800 1809 - 1457 1902 1800 1901 1801 - 1458 1911 1912 1902 1800 - 3905 4877 4884 4875 4883 - 1355 1690 1692 1915 1914 - 1358 1693 1742 1692 1739 - 1354 1696 1693 1690 1692 - 3904 4897 4900 4877 4884 - 1347 1752 1697 1911 1912 - 1352 1695 1696 1697 1690 - 1347 1687 1695 1752 1697 - 27 114 63 187 186 - 384 588 547 583 520 - 439 581 588 564 583 - 438 564 583 565 586 - 127 166 161 697 696 - 67 124 126 142 141 - 77 122 121 126 120 - 76 125 122 124 126 - 734 927 926 1311 1313 - 75 119 176 121 175 - 107 187 186 176 151 - 75 185 187 119 176 - 124 162 1315 161 1316 - 72 123 127 125 122 - 74 184 185 127 119 - 69 183 184 123 127 - 99 146 1324 171 1323 - 1188 1603 1602 1505 1529 - 1203 1611 1610 1602 1515 - 1242 1554 1611 1603 1602 - 1563 1959 2001 1962 2000 - 3831 4773 4772 4760 4768 - 3829 4777 4760 4776 4762 - 3830 4781 4773 4777 4760 - 126 165 162 166 161 - 3795 4744 4789 4790 4788 - 3794 4734 4776 4789 4787 - 3795 4742 4734 4744 4789 - 3863 4938 4837 4937 4825 - 3792 4736 4735 4742 4734 - 3799 4782 4781 4735 4777 - 3799 4748 4782 4736 4735 - 99 144 146 170 171 - 3972 4948 4950 4947 4949 - 3975 5080 5107 4950 5113 - 3968 5084 5080 4948 4950 - 3847 4935 4813 4837 4836 - 3839 4771 4775 4772 4764 - 3815 4750 4749 4775 4774 - 3811 4757 4750 4771 4775 - 3862 4944 4935 4938 4837 - 3813 4751 4800 4749 4799 - 3654 5180 5179 4800 4587 - 3812 5178 5180 4751 4800 - 3814 4749 4799 4774 4798 - 3811 4758 4752 4757 4750 - 3810 5177 5178 4752 4751 - 3809 5176 5177 4758 4752 - 2442 3116 3117 3061 3058 - 3947 4908 4909 4933 4906 - 3950 4907 4937 4909 4946 - 3949 4911 4907 4908 4909 - 3544 4410 4412 4409 4404 - 3820 4778 4754 4783 4756 - 3798 4747 4783 4748 4782 - 3781 4725 4778 4747 4783 - 3540 4400 4399 4412 4411 - 3808 5175 5176 4759 4758 - 3816 5174 5175 4779 4759 - 3819 4779 4759 4754 4753 - 3542 4401 4400 4410 4412 - 3777 4723 4780 4725 4778 - 3776 5173 5174 4780 4779 - 3779 5172 5173 
4723 4780 - 208 270 282 289 287 - 439 580 581 572 564 - 445 579 580 577 572 - 436 577 572 576 566 - 3836 4774 4798 4766 4797 - 1932 2503 2361 2516 2365 - 1935 2517 2363 2361 2362 - 1934 2502 2517 2503 2361 - 567 740 761 829 831 - 1931 2414 2417 2363 2416 - 1979 2400 2404 2417 2403 - 1978 2510 2400 2414 2417 - 534 760 713 761 711 - 1864 2283 2518 2502 2517 - 1976 2501 2510 2518 2414 - 1866 2351 2501 2283 2518 - 376 489 496 488 487 - 1244 1555 1553 1554 1611 - 1247 1563 1593 1553 1594 - 1246 1562 1563 1555 1553 - 3653 4799 4584 4798 4605 - 2526 3138 3142 3140 3139 - 2525 3194 3140 5179 5154 - 2524 3193 3138 3194 3140 - 12 56 52 73 74 - 2516 3143 3134 3142 3141 - 2519 3136 3146 3134 3148 - 2518 3135 3136 3143 3134 - 566 742 760 740 761 - 2527 3192 3144 3193 3138 - 2515 3133 3135 3144 3143 - 2515 3191 3133 3192 3144 - 102 147 1322 146 1324 - 383 501 500 493 492 - 383 499 501 490 493 - 382 490 493 491 497 - 15 53 1318 52 1317 - 2506 3185 3187 3146 3122 - 2559 3167 3168 3187 3186 - 2559 3177 3167 3185 3187 - 665 860 859 861 858 - 2553 3169 3175 3168 3166 - 2548 3171 3162 3175 3174 - 2552 3172 3171 3169 3175 - 14 57 53 56 52 - 2558 3176 3173 3177 3167 - 2556 3170 3172 3173 3169 - 2544 3178 3170 3176 3173 - 98 153 147 144 146 - 1271 1594 1577 1610 1605 - 1269 1579 1578 1577 1606 - 1270 1593 1579 1594 1577 - 1237 1552 1557 1609 1601 - 2539 3152 3154 3184 3183 - 2514 3190 3184 3191 3133 - 2539 3189 3152 3190 3184 - 1187 1609 1601 1607 1507 - 2532 3157 3178 3160 3176 - 2541 3158 3157 3155 3160 - 2540 3155 3160 3154 3153 - 1243 1557 1554 1601 1603 - 2538 3188 3159 3189 3152 - 2536 3156 3158 3159 3155 - 2528 3195 3156 3188 3159 - 210 274 283 270 282 - 2016 2450 5191 2449 2447 - 2022 2457 2450 2455 2449 - 2023 2455 2449 11 2448 - 3921 4931 4934 4891 4895 - 166 233 224 249 250 - 167 226 353 224 358 - 165 225 226 233 224 - 1003 1234 1269 1259 1268 - 68 182 183 117 123 - 70 361 182 354 117 - 71 354 117 353 118 - 962 1267 1202 1269 1200 - 163 230 223 225 226 - 160 346 361 223 354 - 
162 344 346 230 223 - 3612 4578 4580 4535 4582 - 683 1329 869 1328 969 - 680 871 875 869 877 - 682 1327 871 1329 869 - 4094 5065 5070 5048 5069 - 2462 3071 3080 3075 3082 - 2461 3073 3075 3076 3074 - 2460 3072 3071 3073 3075 - 1002 1236 1267 1234 1269 - 2450 3077 3076 3067 3069 - 2449 3219 3077 3218 3067 - 2451 3218 3067 5172 5173 - 1053 1367 1366 1364 1365 - 2448 3217 3078 3219 3077 - 2459 3070 3072 3078 3073 - 2458 3216 3070 3217 3078 - 3635 4580 4556 4582 4557 - 685 895 894 870 876 - 686 1326 895 1325 870 - 687 1325 870 1327 871 - 3929 4934 4933 4895 4929 - 2489 3101 3106 3100 3098 - 2484 3103 3195 3106 3188 - 2488 3104 3103 3101 3106 - 1043 1359 1401 1366 1400 - 2495 3108 3105 3107 3099 - 2492 3102 3104 3105 3101 - 2480 3109 3102 3108 3105 - 1041 1361 1359 1367 1366 - 2444 3114 3115 3080 3059 - 2494 3099 3100 3115 3116 - 2495 3107 3099 3114 3115 - 3608 4535 4582 4533 4581 - 678 876 872 875 874 - 679 897 896 872 867 - 684 894 897 876 872 - 3759 4712 4698 4715 4694 - 2456 3215 3113 3216 3070 - 2475 3214 3086 3215 3113 - 2474 3086 3088 3113 3112 - 3758 4715 4694 4721 4720 - 2477 3089 3093 3088 3087 - 2468 3091 3109 3093 3108 - 2476 3092 3091 3089 3093 - 3759 4698 4697 4694 4696 - 2475 3213 3094 3214 3086 - 2472 3090 3092 3094 3089 - 2464 3223 3090 3213 3094 - 664 855 860 853 861 - 380 506 499 509 490 - 346 469 506 455 509 - 347 455 509 457 508 - 1843 2273 2256 2272 2279 - 3454 5228 4306 5227 5263 - 3451 4298 4302 4306 4305 - 3450 4300 4298 5228 4306 - 1842 2264 2263 2256 2258 - 1839 2250 5226 2252 2251 - 3455 5262 4300 5226 5228 - 1839 5203 5262 2250 5226 - 1613 2007 2080 2029 2008 - 1827 2254 2255 2248 2266 - 1835 2251 5227 2255 2268 - 1832 2252 2251 2254 2255 - 1751 2158 2192 2151 2193 - 1720 2203 2207 2122 2206 - 1775 2168 2172 2207 2173 - 1774 2185 2168 2203 2207 - 1843 2262 2264 2273 2256 - 3538 4402 4401 4398 4410 - 3539 4415 4398 4414 4408 - 3537 4460 4402 4415 4398 - 303 404 407 406 405 - 3543 4467 4470 4401 4400 - 3474 4340 4339 4470 4469 - 3473 4342 4340 
4467 4470 - 2258 2846 2837 2840 2839 - 3536 4459 4468 4460 4402 - 3472 4343 4342 4468 4467 - 3477 4357 4343 4459 4468 - 1750 2151 2193 2208 2202 - 1723 2125 2122 2124 2128 - 1725 2123 2125 2133 2124 - 1724 2133 2124 2132 2134 - 302 402 406 423 425 - 3462 4330 4331 4457 4456 - 3463 4329 4357 4331 4459 - 3456 4328 4329 4330 4331 - 3928 4891 4895 4893 4890 - 3508 4388 4386 4370 4385 - 3459 4337 4328 4386 4330 - 3458 4387 4337 4388 4386 - 303 403 404 402 406 - 3506 4369 4463 4458 4461 - 3505 4385 4457 4463 4462 - 3504 4370 4385 4369 4463 - 1787 2193 2180 2202 2178 - 1311 1645 1750 1904 1903 - 1346 1751 1687 1750 1752 - 1298 1637 1751 1645 1750 - 450 592 647 590 652 - 3535 4394 4395 4454 4453 - 3531 5225 4454 5224 5257 - 3531 4393 4394 5225 4454 - 451 590 652 617 651 - 3526 4392 4414 4390 4413 - 3525 4391 4392 4396 4390 - 3527 4396 4390 4395 4452 - 499 647 632 652 630 - 3533 4465 4397 4393 4394 - 3524 4461 4391 4397 4396 - 3532 4458 4461 4465 4397 - 26 115 65 114 63 - 1596 2220 2532 1983 2531 - 1598 2216 2220 1984 1983 - 1599 1984 1983 1992 2534 - 3165 4003 3998 4002 4496 - 715 910 968 933 967 - 740 965 936 968 935 - 715 912 965 910 968 - 3163 3996 3995 3998 3997 - 747 977 976 943 942 - 738 975 977 934 943 - 739 934 943 936 937 - 2243 2825 2850 2827 2851 - 719 911 966 912 965 - 736 974 975 966 934 - 718 973 974 911 966 - 982 1216 2616 1215 2615 - 195 258 269 260 268 - 202 287 286 269 262 - 193 289 287 258 269 - 3164 4004 3996 4003 3998 - 3948 4910 4911 4912 4908 - 3942 4903 4912 4931 4934 - 3943 4904 4910 4903 4912 - 1612 2029 2008 2028 2027 - 3920 4932 4931 4885 4891 - 3922 5223 4932 5222 4885 - 3923 5222 4885 5229 4887 - 158 217 214 219 222 - 3938 5221 4902 5223 4932 - 3939 4905 4904 4902 4903 - 3937 5220 4905 5221 4902 - 1791 2192 2191 2193 2180 - 1726 2200 2201 2123 2125 - 1784 2177 2185 2201 2203 - 1786 2178 2177 2200 2201 - 145 206 205 214 203 - 3961 4920 4923 4918 4922 - 3955 5171 5170 4923 4913 - 3960 5169 5171 4920 4923 - 144 215 206 217 214 - 3965 5219 4917 
5218 4919 - 3967 5168 5169 4917 4920 - 3966 5217 5168 5219 4917 - 3475 4350 4352 4342 4340 - 3936 5216 4928 5220 4905 - 3963 4919 4918 4928 4927 - 3964 5218 4919 5216 4928 - 979 1215 2615 4 2614 - 3150 4036 4038 3987 4007 - 3195 4022 4021 4038 4037 - 3195 4032 4022 4036 4038 - 47 350 83 349 104 - 3951 4926 4925 4911 4907 - 3953 4921 4915 4926 4925 - 3958 4915 4944 4925 4938 - 46 351 84 350 83 - 3844 5167 5166 4936 4816 - 3957 5165 5167 4914 4936 - 3959 4914 4936 4944 4935 - 45 84 89 83 91 - 3952 4913 4916 4921 4915 - 3956 5164 5165 4916 4914 - 3954 5170 5164 4913 4916 - 49 94 115 92 114 - 1879 2534 2533 2541 2291 - 1579 1992 2534 1975 2541 - 1578 1975 2541 1973 2540 - 2246 2828 2827 5158 5159 - 723 918 964 1312 1304 - 764 962 952 964 951 - 722 920 962 918 964 - 1852 2268 5215 2267 2260 - 762 950 955 952 954 - 761 959 960 955 953 - 760 958 959 950 955 - 3428 5263 4286 5215 5214 - 727 919 963 920 962 - 763 967 958 963 950 - 727 933 967 919 963 - 1711 2218 2113 2217 2216 - 3199 4023 4028 4021 4040 - 3190 4026 4050 4028 4049 - 3196 4027 4026 4023 4028 - 1834 5227 5263 2268 5215 - 2299 2868 2870 2887 2886 - 2267 2885 2887 2852 2844 - 2299 2878 2868 2885 2887 - 1908 2324 2323 2319 2322 - 2301 2871 2875 2870 2869 - 2292 2873 3223 2875 3213 - 2300 2874 2873 2871 2875 - 2242 2834 2825 2828 2827 - 2298 2877 2876 2878 2868 - 2296 2872 2874 2876 2871 - 2288 2879 2872 2877 2876 - 979 1213 1215 1212 4 - 3193 4025 4027 4024 4023 - 3192 4031 4025 4030 4024 - 3194 4030 4024 4032 4022 - 1906 2329 2318 2324 2323 - 2251 2894 2884 2893 2832 - 2283 2856 2858 2884 2883 - 2283 2892 2856 2894 2884 - 1907 2318 2514 2323 2513 - 2285 2859 2863 2858 2857 - 2276 2861 2879 2863 2877 - 2284 2862 2861 2859 2863 - 1627 2069 2029 2068 2028 - 2282 2891 2864 2892 2856 - 2280 2860 2862 2864 2859 - 2272 2895 2860 2891 2864 - 978 1224 1216 1213 1215 - 1307 1640 1646 1641 1642 - 1299 1643 1637 1646 1645 - 1307 1644 1643 1640 1646 - 2018 5191 5273 2447 5213 - 2247 2835 2834 2826 2828 - 2245 2890 2826 5166 
5157 - 2244 2889 2835 2890 2826 - 3562 5273 4420 5213 5212 - 2252 2830 2852 2833 2849 - 2254 2831 2830 2836 2833 - 2240 2836 2833 2834 2825 - 2019 2447 5213 2446 23 - 2248 2888 2829 2889 2835 - 2250 2832 2831 2829 2836 - 2251 2893 2832 2888 2829 - 50 92 114 185 187 - 1807 2538 2228 2350 2349 - 1806 2532 2538 2531 2350 - 1878 2531 2350 2533 2292 - 3582 4453 4455 4435 4436 - 751 944 939 960 961 - 748 940 986 939 991 - 750 941 940 944 939 - 79 120 174 140 173 - 617 833 832 794 796 - 618 988 833 987 794 - 619 987 794 986 801 - 111 175 150 174 149 - 746 942 938 941 940 - 744 972 988 938 987 - 747 976 972 942 938 - 1710 2118 2117 2218 2113 - 1895 2543 2309 21 2308 - 1893 2315 2314 2309 2307 - 1894 2530 2315 2543 2309 - 78 121 175 120 174 - 2202 2782 2781 2777 2779 - 2203 5211 2777 5217 5168 - 2201 5210 2782 5211 2777 - 3391 4236 4239 4474 4476 - 2199 2774 2775 2780 2776 - 2198 5209 2774 5208 2780 - 2200 5208 2780 5210 2782 - 3583 4452 4451 4455 4438 - 2195 2776 2784 2781 2783 - 2194 2773 2788 2784 2787 - 2195 2775 2773 2776 2784 - 1595 2215 2218 1989 2217 - 1899 2310 2312 2316 2313 - 1898 2529 2310 2528 2316 - 1892 2528 2316 2530 2315 - 3534 4395 4452 4453 4455 - 2187 2822 2824 2788 2768 - 2239 2806 2807 2824 2823 - 2239 2815 2806 2822 2824 - 3390 4242 4237 4236 4239 - 2233 2808 2813 2807 2805 - 2228 2810 2895 2813 2891 - 2232 2811 2810 2808 2813 - 3386 4237 4233 4239 4238 - 2238 2814 2812 2815 2806 - 2236 2809 2811 2812 2808 - 2224 2816 2809 2814 2812 - 1702 2110 2118 2215 2218 - 1902 2313 2311 2314 2317 - 1903 2332 2337 2311 2336 - 1901 2312 2332 2313 2311 - 1151 1447 2555 2585 2584 - 2196 5207 2821 5209 2774 - 2219 5206 2793 5207 2821 - 2218 2793 2795 2821 2820 - 1150 1445 2585 1928 2583 - 2221 2796 2800 2795 2794 - 2212 2798 2816 2800 2814 - 2220 2799 2798 2796 2800 - 1151 1446 1447 1445 2585 - 2219 5205 2801 5206 2793 - 2216 2797 2799 2801 2796 - 2208 5281 2797 5205 2801 - 3787 4732 4731 4726 4730 - 918 1156 1150 1271 1272 - 919 1152 2599 1150 2598 - 917 1151 1152 
1156 1150 - 2554 3168 3166 3186 3182 - 4018 4984 4983 5067 4982 - 4023 5001 4989 4983 4987 - 4022 5000 5001 4984 4983 - 441 674 569 673 568 - 4031 5002 5006 4989 4993 - 4014 4997 4998 5006 4977 - 3982 4967 4997 5002 5006 - 443 571 576 569 575 - 3999 4963 5003 5000 5001 - 3998 4970 4967 5003 5002 - 3995 4961 4970 4963 5003 - 3522 4457 4456 4462 4389 - 48 95 113 94 115 - 28 112 75 113 64 - 53 107 112 95 113 - 440 672 571 674 569 - 1662 2052 2054 2050 2057 - 1663 2567 2050 2566 24 - 1661 2565 2052 2567 2050 - 543 720 715 826 827 - 1653 2055 2053 2054 2049 - 1654 2061 2060 2053 2062 - 1658 2059 2061 2055 2053 - 1981 2507 2407 2401 2399 - 1660 2564 2051 2565 2052 - 1659 2058 2059 2051 2055 - 1657 2563 2058 2564 2051 - 3520 4456 4460 4389 4415 - 2511 3132 3131 3130 3129 - 2510 3124 3123 3131 3128 - 2511 3125 3124 3132 3131 - 542 719 716 720 715 - 1633 2036 2037 2060 2034 - 1638 2035 2070 2037 2069 - 1637 2043 2035 2036 2037 - 651 849 848 847 846 - 1615 2078 2082 2070 2007 - 1572 1999 1971 2082 2081 - 1565 1961 1999 2078 2082 - 540 716 2607 715 2609 - 1639 2077 2079 2043 2035 - 1564 1960 1961 2079 2078 - 1566 2075 1960 2077 2079 - 67 116 124 143 142 - 59 96 101 183 184 - 51 100 94 101 92 - 59 102 100 96 101 - 3851 4822 4818 4835 4834 - 1643 2574 2040 2578 2038 - 1656 2562 2578 2563 2058 - 1451 1792 2574 2562 2578 - 3855 4820 4817 4822 4818 - 1646 2041 2039 2040 2044 - 1647 2076 2075 2039 2077 - 1645 2074 2076 2041 2039 - 3854 4817 5111 4818 5110 - 1451 1793 2575 1792 2574 - 1644 2576 2074 2575 2041 - 1450 1798 2576 1793 2575 - 1219 1545 1537 1561 1562 - 663 858 851 976 972 - 660 852 981 851 989 - 662 859 852 858 851 - 1111 1472 1471 1418 1422 - 1942 2382 2413 2383 2368 - 1972 2418 2396 2413 2412 - 1940 2415 2418 2382 2413 - 1106 1416 1418 1429 1423 - 1961 2405 2488 2408 2489 - 1970 2409 2405 2397 2408 - 1975 2397 2408 2396 2487 - 650 838 847 862 863 - 1969 2403 2398 2415 2418 - 1971 2402 2409 2398 2397 - 1968 2404 2402 2403 2398 - 3523 4462 4389 4391 4392 - 55 103 93 102 
100 - 52 106 107 93 95 - 54 105 106 103 93 - 1445 1791 1790 1796 1797 - 122 171 1323 160 1314 - 123 164 160 165 162 - 121 170 171 164 160 - 1111 1417 1472 1416 1418 - 117 158 165 167 166 - 116 159 158 157 167 - 118 157 167 694 695 - 1344 1749 1688 1751 1687 - 115 156 163 159 158 - 120 169 170 163 164 - 114 168 169 156 163 - 3490 4480 4478 4360 4384 - 2256 2848 2845 2846 2837 - 2268 2842 3216 2845 3217 - 2270 2843 2842 2848 2845 - 1441 1910 1909 1791 1790 - 108 149 152 168 169 - 96 154 153 152 144 - 110 150 154 149 152 - 651 842 849 838 847 - 103 155 145 153 147 - 101 181 1328 145 1321 - 100 180 181 155 145 - 1443 1909 1908 1790 1795 - 106 151 148 150 154 - 104 179 180 148 155 - 107 186 179 151 148 - 3489 4384 4383 4358 4387 - 82 139 137 138 130 - 83 141 140 137 128 - 92 142 141 139 137 - 3569 4450 4449 4432 4430 - 707 1324 906 1323 930 - 706 1322 909 1324 906 - 705 909 908 906 913 - 3570 4432 4430 4441 4446 - 710 914 917 908 916 - 717 971 973 917 911 - 716 970 971 914 917 - 3571 4449 4448 4430 25 - 709 1321 907 1322 909 - 711 969 970 907 914 - 708 1328 969 1321 907 - 1217 1547 1540 1546 1545 - 659 856 850 859 852 - 656 865 983 850 982 - 658 864 865 856 850 - 714 916 912 915 910 - 1384 1937 1932 1716 1931 - 1148 1448 1446 1932 1445 - 1139 1442 1448 1937 1932 - 4059 5027 5022 5026 5064 - 1147 1444 1453 1446 1447 - 1123 1451 1431 1453 2554 - 1146 1452 1451 1444 1453 - 4055 5020 5026 5061 5062 - 1142 1441 1449 1442 1448 - 1144 1450 1452 1449 1444 - 1141 1461 1450 1441 1449 - 3471 4478 4477 4384 4383 - 2261 2851 2838 5160 5161 - 2263 2847 2846 2838 2840 - 2260 2850 2847 2851 2838 - 4020 5070 4984 5069 5067 - 667 904 855 905 853 - 673 866 905 973 974 - 672 868 904 866 905 - 4052 5019 5027 5020 5026 - 670 854 857 855 860 - 669 863 864 857 856 - 668 862 863 854 857 - 4043 5017 5014 5010 5015 - 677 867 903 868 904 - 671 902 862 903 854 - 676 896 902 867 903 - 1963 2392 2391 2405 2488 - 2264 2849 2841 2850 2847 - 2266 2844 2843 2841 2848 - 2267 2852 2844 2849 2841 - 4034 
5007 5008 5014 5009 - 647 846 836 864 865 - 644 837 992 836 984 - 646 848 837 846 836 - 1614 2070 2007 2069 2029 - 547 993 727 992 729 - 546 726 5281 727 734 - 547 978 726 993 727 - 4035 5016 5007 5017 5014 - 643 843 835 848 837 - 640 834 978 835 993 - 642 844 834 843 835 - 3488 4360 4384 4359 4358 - 1499 1916 1921 1848 1838 - 1309 1642 1904 1921 1920 - 1308 1641 1642 1916 1921 - 3962 4922 4921 4924 4926 - 648 900 842 901 838 - 699 883 901 896 902 - 698 886 900 883 901 - 3940 4927 4924 4904 4910 - 654 841 845 842 849 - 653 839 844 845 843 - 652 840 839 841 845 - 3963 4918 4922 4927 4924 - 703 885 899 886 900 - 655 898 840 899 841 - 703 884 898 885 899 - 1216 1540 1539 1545 1537 - 575 989 744 988 833 - 574 745 748 744 750 - 575 981 745 989 744 - 3191 4020 4019 4027 4026 - 1121 1433 2582 1431 2580 - 1134 1438 2560 2582 2581 - 1120 1439 1438 1433 2582 - 85 130 131 135 129 - 1754 2571 2153 2579 2157 - 1062 1469 2571 2561 2579 - 1747 2561 2579 2560 2150 - 3189 4046 4045 4019 4051 - 1135 1437 1436 1439 1438 - 1133 1470 1469 1436 2561 - 1132 1475 1470 1437 1436 - 1963 2410 2392 2409 2405 - 3833 4765 4796 4763 4795 - 3668 4797 4596 4796 4603 - 3832 4766 4797 4765 4796 - 2191 2823 2819 2767 2766 - 18 74 1326 60 1325 - 19 70 60 69 62 - 17 73 74 70 60 - 3188 5141 4046 4020 4019 - 25 66 69 68 67 - 24 65 66 63 68 - 27 63 68 186 179 - 704 908 916 913 915 - 31 64 71 65 66 - 16 72 73 71 70 - 30 75 72 64 71 - 1962 2393 2394 2410 2392 - 3827 4768 4770 4761 4769 - 3838 4764 4766 4770 4765 - 3839 4772 4764 4768 4770 - 2234 2807 2805 2823 2819 - 700 891 892 890 887 - 702 889 884 892 885 - 689 878 889 891 892 - 2235 2805 2892 2819 2894 - 688 879 878 880 891 - 690 1320 879 1319 880 - 691 1319 880 1318 882 - 712 913 915 931 932 - 695 1317 881 1326 895 - 694 882 890 881 888 - 695 1318 882 1317 881 - 1965 2394 2390 2392 2391 - 3824 4786 4767 4785 4792 - 3835 4769 4763 4767 4806 - 3827 4761 4769 4786 4767 - 20 62 1327 61 1329 - 7 48 46 75 72 - 6 47 58 48 46 - 4 58 57 46 56 - 22 69 62 67 61 - 
10 51 54 57 53 - 9 49 1320 54 1319 - 8 50 49 51 54 - 23 67 61 180 181 - 3 45 59 47 58 - 11 55 50 59 51 - 3 44 55 45 59 - 1430 1774 1773 1770 1772 - 187 251 240 253 238 - 138 196 251 194 253 - 139 194 253 682 683 - 1348 1748 1755 1749 1688 - 3326 4147 4515 4502 4513 - 3308 4153 4491 4515 4514 - 3318 4140 4153 4147 4515 - 1175 1489 1622 1497 1625 - 3317 4141 4140 4148 4147 - 3316 4145 4141 4146 4148 - 3321 4146 4148 4143 4144 - 1178 1495 1497 1493 1494 - 3325 5204 4505 5203 5262 - 3327 4144 4502 4505 4504 - 3324 4143 4144 5204 4505 - 1437 1816 1778 1776 1777 - 306 413 410 412 420 - 307 425 424 410 418 - 305 423 425 413 410 - 3179 4011 4032 4035 4036 - 3187 4029 4018 4031 4025 - 3186 5138 5142 4029 4018 - 3185 5142 5141 4018 4020 - 1174 1490 1489 1495 1497 - 3605 4531 4539 5127 5128 - 3604 4807 4531 5143 5127 - 3184 5143 5127 5141 4046 - 972 1206 2595 1205 2596 - 3740 4684 4679 5138 5142 - 3743 4680 4807 4679 5143 - 3742 4685 4680 4684 4679 - 1435 1778 1786 1777 1775 - 311 431 412 430 411 - 266 371 431 369 430 - 267 369 430 515 516 - 975 1210 1205 1228 1227 - 3722 4670 4668 5148 5139 - 3723 4686 4685 4668 4684 - 3721 4672 4686 4670 4668 - 3180 4012 4035 4039 4034 - 3726 4671 4672 4673 4670 - 3719 4665 4671 5105 4673 - 3727 5105 4673 5104 4669 - 974 1209 1206 1210 1205 - 3175 5124 5126 4181 4009 - 3725 4669 5148 5126 5125 - 3724 5104 4669 5124 5126 - 1434 1818 1817 1778 1786 - 535 711 721 830 825 - 533 712 719 721 720 - 532 713 712 711 721 - 906 1197 1142 1199 1140 - 3285 4120 4184 4121 4183 - 3286 4190 4182 4120 4184 - 3182 4182 4013 4184 4012 - 943 1169 1199 1189 1198 - 3178 4015 4017 4013 4011 - 3176 4014 4031 4017 4030 - 3169 4010 4014 4015 4017 - 942 1171 1197 1169 1199 - 3183 4191 4016 4190 4182 - 3172 4009 4010 4016 4015 - 3174 4181 4009 4191 4016 - 1318 1662 1661 1655 1660 - 569 754 753 743 749 - 570 983 754 982 743 - 571 982 743 981 745 - 286 392 389 396 395 - 1140 1460 1463 1461 1450 - 1127 1459 1462 1463 1432 - 1101 1427 1459 1460 1463 - 273 381 380 389 378 
- 1131 1465 1475 1435 1437 - 1130 1458 1465 1457 1435 - 1128 1457 1435 1462 1440 - 3179 4013 4011 4012 4035 - 1114 1428 1419 1427 1459 - 1115 1421 1458 1419 1457 - 1113 1424 1421 1428 1419 - 1831 2536 2253 2278 2249 - 271 370 432 371 431 - 304 428 423 432 413 - 271 394 428 370 432 - 2555 3166 27 3182 3181 - 627 802 813 1307 1295 - 627 810 812 802 813 - 634 812 808 813 806 - 1551 2559 1949 2558 1948 - 637 817 816 807 809 - 638 815 817 811 807 - 639 811 807 810 812 - 272 390 381 392 389 - 633 809 824 808 823 - 591 822 787 824 769 - 636 816 822 809 824 - 1436 1819 1818 1816 1778 - 2499 3126 3130 3118 3121 - 2498 3147 3126 3145 3118 - 2496 3145 3118 5183 5185 - 3483 4346 4345 4352 4351 - 623 795 821 798 820 - 595 831 830 821 777 - 623 829 831 795 821 - 1545 2569 1947 1949 1954 - 622 828 829 797 795 - 616 832 828 796 797 - 620 796 797 800 799 - 1544 2557 2569 2559 1949 - 614 793 792 815 817 - 609 799 798 792 790 - 608 800 799 793 792 - 3223 5120 4165 4063 4164 - 1420 1788 1762 1787 1761 - 1423 1784 1763 1762 1765 - 1422 1789 1784 1788 1762 - 175 228 234 247 248 - 597 775 785 787 789 - 598 776 784 775 785 - 603 784 783 785 778 - 164 232 225 234 233 - 607 782 779 783 781 - 606 827 2608 779 2587 - 607 826 827 782 779 - 174 229 232 228 234 - 594 777 774 776 784 - 592 825 826 774 782 - 595 830 825 777 774 - 1226 1546 1545 1560 1561 - 3279 4157 4118 4160 4159 - 3306 4158 4157 4136 4160 - 3307 4136 4160 4149 4137 - 959 1181 1196 1274 1276 - 1516 1860 1863 1859 1857 - 1518 1885 1886 1863 1855 - 1505 1884 1885 1860 1863 - 3263 5114 4101 5272 4108 - 1504 1883 1884 1849 1860 - 1506 2618 1883 2617 1849 - 1507 2617 1849 2616 1853 - 1729 5202 5272 2556 5271 - 1511 2615 1852 2614 1851 - 1510 1853 1859 1852 1858 - 1511 2616 1853 2615 1852 - 1828 2527 2546 2536 2253 - 531 710 718 713 712 - 539 725 724 718 717 - 530 723 725 710 718 - 3262 5258 5114 5202 5272 - 1219 1537 1592 1562 1563 - 1218 1539 1595 1537 1592 - 1279 1595 1581 1592 1591 - 920 1195 1155 1196 1153 - 1273 5222 5229 1582 
1580 - 1272 5223 5222 1583 1582 - 1278 1583 1582 1581 1586 - 3400 4266 4323 4264 4322 - 1223 1538 1596 1539 1595 - 1276 5221 5223 1596 1583 - 1223 5220 5221 1538 1596 - 1836 2546 2252 2253 2254 - 1411 1756 2563 1767 2564 - 1410 1757 1756 1768 1767 - 1419 1768 1767 1763 1766 - 596 819 775 822 787 - 1220 1548 1544 1547 1540 - 1222 5216 5220 1544 1538 - 1229 5218 5216 1548 1544 - 3406 4324 4321 4266 4323 - 1228 5219 5218 1542 1548 - 1230 5217 5219 2613 1542 - 1231 2613 1542 2612 1543 - 3356 4321 4213 4323 4250 - 1225 2611 1541 2610 1559 - 1227 1543 1547 1541 1546 - 1224 2612 1543 2611 1541 - 3365 4258 4257 4221 4244 - 1426 1781 1785 1789 1784 - 1415 1783 1757 1785 1768 - 1433 1775 1783 1781 1785 - 1772 2186 2184 2185 2168 - 1239 2609 1550 2608 1608 - 1238 2607 1551 2609 1550 - 1236 1551 1552 1550 1609 - 1771 2188 2187 2184 2169 - 1234 1556 1558 1552 1557 - 1241 1561 1562 1558 1555 - 1240 1560 1561 1556 1558 - 1781 2182 2188 2186 2184 - 1233 2606 1549 2607 1551 - 1235 1559 1560 1549 1556 - 1232 2610 1559 2606 1549 - 1317 1660 1677 1679 1678 - 3311 4137 4138 4152 4139 - 3310 4149 4137 4154 4152 - 3319 4154 4152 4140 4153 - 615 801 800 791 793 - 2003 2442 2445 2428 2430 - 1994 2423 2421 2445 2444 - 1997 2490 2423 2442 2445 - 613 991 791 990 814 - 1990 2420 5224 2425 2419 - 1986 2424 2420 2427 2425 - 1995 2427 2425 2421 2426 - 3483 4353 4346 4350 4352 - 1999 2489 2422 2490 2423 - 1998 2494 2424 2422 2427 - 1999 2488 2494 2489 2422 - 3351 4209 4219 4258 4257 - 3251 4092 4090 5254 4085 - 3252 4091 4086 4090 4087 - 3258 4088 4091 4092 4090 - 2038 2460 5201 2476 2470 - 572 750 747 832 828 - 565 748 741 750 747 - 564 741 742 747 740 - 958 1183 1195 1181 1196 - 563 752 751 746 739 - 568 753 752 749 746 - 560 749 746 748 741 - 612 986 801 991 791 - 562 739 759 742 760 - 528 758 723 759 710 - 563 751 758 739 759 - 3367 4259 4258 4222 4221 - 1743 5255 5254 2195 5195 - 1743 2143 5255 2145 2195 - 1737 2145 2195 2189 2194 - 2039 5238 5261 2460 5201 - 522 701 709 723 725 - 521 708 707 
709 706 - 520 703 708 701 709 - 3554 5261 4443 5201 5268 - 512 5211 5217 698 2613 - 514 5210 5211 704 698 - 515 704 698 707 700 - 1279 1581 1586 1591 1584 - 526 702 705 703 708 - 525 5208 5210 705 704 - 524 5209 5208 702 705 - 3351 4215 4209 4259 4258 - 1742 5200 5256 2143 5255 - 3259 4089 4088 5256 4092 - 3261 5270 4089 5200 5256 - 1513 1857 1856 1862 1854 - 558 735 736 753 752 - 549 730 737 735 736 - 559 737 733 736 732 - 1512 1859 1857 1858 1862 - 555 731 757 733 756 - 527 5207 5209 757 702 - 554 5206 5207 731 757 - 1508 1858 1862 1850 1861 - 544 734 738 730 737 - 553 5205 5206 738 731 - 552 5281 5205 734 738 - 1428 1770 1772 1769 1771 - 191 248 249 242 241 - 191 247 248 239 242 - 190 239 242 240 245 - 3647 4562 17 4561 4560 - 3234 4076 4171 5278 4170 - 3320 4172 4145 4171 4146 - 3233 4077 4172 4076 4171 - 1297 1638 1749 1637 1751 - 3235 4078 4077 5280 4076 - 3238 5199 4078 5198 5280 - 1770 5198 5280 2187 5279 - 3644 4565 4561 5152 5147 - 1769 2169 2171 2172 2170 - 1768 5279 5278 2171 5193 - 1771 2187 5279 2169 2171 - 3828 4776 4762 4787 4784 - 81 128 172 131 178 - 112 173 168 172 156 - 80 140 173 128 172 - 3646 4566 4562 4565 4561 - 3619 4546 4554 4544 4571 - 3620 4553 4555 4546 4554 - 3631 4555 4552 4554 4551 - 3880 4941 4871 4940 4844 - 3702 4629 4624 4660 4659 - 3699 4627 4629 4658 4660 - 3630 4658 4660 4552 4550 - 1980 2500 2507 2511 2401 - 3628 4656 4657 4553 4555 - 3699 4634 4627 4657 4658 - 3698 4636 4634 4656 4657 - 3747 4784 4786 4693 4785 - 3480 4476 4475 4348 4344 - 3485 4474 4476 4349 4348 - 3482 4349 4348 4353 4346 - 3859 4823 4831 4871 4870 - 3586 4654 4655 4576 4583 - 3711 4631 4636 4655 4656 - 3711 4644 4631 4654 4655 - 1266 1591 1584 1593 1579 - 3678 4661 4644 4653 4654 - 3589 4662 4661 4521 4653 - 3585 4521 4653 4526 4519 - 3858 4945 4823 4941 4871 - 3598 4529 4528 4524 4523 - 3587 4519 4576 4528 4575 - 3584 4526 4519 4529 4528 - 3750 4787 4784 4691 4693 - 3476 4356 4341 4357 4343 - 3479 4354 4353 4341 4350 - 3478 4355 4354 4356 4341 - 1478 
1832 1824 1825 1826 - 3610 4538 4536 4539 4534 - 3614 4537 4540 4538 4536 - 3615 4540 4578 4536 4535 - 1479 1824 1846 1826 1847 - 3632 4579 4573 4578 4580 - 3618 4545 4544 4573 4572 - 3617 4577 4545 4579 4573 - 1474 1825 1826 1884 1885 - 3600 4541 4543 4537 4540 - 3596 4523 4577 4543 4579 - 3599 4524 4523 4541 4543 - 1319 1655 1660 1653 1679 - 3305 4156 4158 4151 4136 - 3304 4168 4156 4167 4151 - 3315 4167 4151 4166 4150 - 1267 1586 1585 1584 1576 - 1495 1847 1891 1886 1889 - 1494 1836 1837 1891 1890 - 1495 1846 1836 1847 1891 - 3872 4939 4840 5078 5079 - 1502 1841 1901 1844 1900 - 1497 1840 1841 1843 1844 - 1491 1843 1844 1837 1835 - 3875 4847 4842 4840 4843 - 1492 1845 1842 1846 1836 - 1496 1838 1840 1842 1843 - 1498 1848 1838 1845 1842 - 3237 5197 4095 5196 5199 - 538 717 714 719 716 - 536 722 2610 714 2606 - 539 724 722 717 714 - 579 762 764 1300 1301 - 948 1185 1188 1277 1273 - 957 1186 1182 1185 1188 - 956 1182 1183 1188 1181 - 3874 4942 4847 4939 4840 - 955 1190 1189 1187 1180 - 944 1191 1190 1184 1187 - 952 1184 1187 1186 1182 - 1296 1684 1748 1638 1749 - 954 1180 1194 1183 1195 - 927 1198 1164 1194 1154 - 955 1189 1198 1180 1194 - 3826 4762 4761 4784 4786 - 3484 4473 4474 4347 4349 - 3486 4472 4473 4487 4347 - 3487 4487 4347 4355 4354 - 582 765 2592 764 2593 - 898 1147 1137 1146 1139 - 899 1301 2601 1137 2602 - 897 1300 1301 1147 1137 - 578 771 765 762 764 - 896 1299 1300 1148 1147 - 910 1298 1299 1141 1148 - 911 1141 1148 1142 1143 - 3624 4551 4549 4570 4569 - 907 1140 1145 1164 1163 - 905 1143 1146 1145 1144 - 904 1142 1143 1140 1145 - 3251 5254 4085 5195 5197 - 1107 1621 1617 1620 1430 - 1104 1618 1417 1617 1416 - 1162 1483 1618 1621 1617 - 1359 1694 1691 1696 1693 - 932 1173 1176 1191 1190 - 941 1174 1170 1173 1176 - 940 1170 1171 1176 1169 - 1357 1709 1712 1694 1691 - 939 1168 1193 1171 1197 - 908 1297 1298 1193 1141 - 939 1296 1297 1168 1193 - 1356 1712 1741 1691 1743 - 936 1172 1175 1174 1170 - 938 1295 1296 1175 1168 - 928 1307 1295 1172 1175 - 
1325 1661 1676 1660 1677 - 3314 4150 4149 4142 4154 - 3315 4166 4150 4173 4142 - 3312 4173 4142 4145 4141 - 3627 4550 7 4549 4548 - 1938 2366 2497 2491 2496 - 2000 2498 2428 2497 2441 - 1937 2367 2498 2366 2497 - 2223 2794 2815 2818 2822 - 1939 2368 2367 2377 2366 - 1943 2383 2368 2378 2377 - 1947 2378 2377 2375 2376 - 2192 2820 2818 2775 2773 - 1944 2374 2370 2373 2369 - 1946 2376 2491 2370 2493 - 1947 2375 2376 2374 2370 - 1776 5195 5197 2175 5196 - 1110 1474 1468 1417 1472 - 1039 1353 1368 1468 1369 - 1038 1352 1353 1474 1468 - 551 984 728 983 754 - 1285 2605 1636 2604 1635 - 1294 2603 1631 2605 1636 - 1295 1631 1633 1636 1632 - 2222 2795 2794 2820 2818 - 1339 1679 1678 1683 1682 - 1316 1653 1679 1681 1683 - 1290 1681 1683 1633 1630 - 3626 4552 4550 4551 4549 - 1292 2602 1680 2603 1631 - 1315 1654 1653 1680 1681 - 1315 2601 1654 2602 1680 - 3222 5135 5120 4062 4063 - 1034 1616 1352 1615 1474 - 1254 1565 1616 1600 1615 - 1164 1600 1615 1483 1618 - 2508 3186 3182 3125 3124 - 1287 2600 1629 2599 1628 - 1287 1635 1634 1629 1627 - 1284 2604 1635 2600 1629 - 550 992 729 984 728 - 1281 2599 1628 2598 1919 - 1483 2598 1919 2597 1830 - 1280 1628 1626 1919 1918 - 548 729 730 728 735 - 1283 1627 1649 1626 1648 - 1306 1650 1644 1649 1640 - 1286 1634 1650 1627 1649 - 3250 4085 4098 5197 4095 - 2505 3148 3127 3147 3126 - 2507 3122 3125 3127 3132 - 2504 3146 3122 3148 3127 - 3171 5139 5138 4008 4029 - 1301 1685 1684 1639 1638 - 1303 1651 1639 1644 1643 - 1302 1647 1685 1651 1639 - 3168 5125 4008 4010 4014 - 1343 1670 1669 1684 1748 - 1340 1671 1753 1669 1754 - 1342 1672 1671 1670 1669 - 3170 5148 5139 5125 4008 - 1300 1682 1686 1647 1685 - 1336 1668 1672 1686 1670 - 1339 1678 1668 1682 1686 - 1912 2339 2341 2332 2337 - 3134 3965 3967 3964 3960 - 3124 3956 3955 3967 3966 - 3126 3957 3956 3965 3967 - 3737 4688 4681 4683 4687 - 1473 2596 1823 2618 1883 - 1472 2595 1833 2596 1823 - 1475 1833 1832 1823 1825 - 1304 1648 1641 1917 1916 - 1484 1828 1848 1831 1845 - 1486 1829 1828 1834 
1831 - 1476 1834 1831 1832 1824 - 1487 1918 1917 1829 1828 - 1480 2594 1827 2595 1833 - 1482 1830 1829 1827 1834 - 1483 2597 1830 2594 1827 - 903 1144 1138 1162 1161 - 2434 3079 3055 5176 5177 - 2435 3065 3064 3055 3057 - 2433 3081 3065 3079 3055 - 3736 4681 4682 4687 4678 - 1019 1248 1266 1250 1265 - 1019 1259 1268 1248 1266 - 987 1268 1230 1266 1221 - 1282 1626 1648 1918 1917 - 1021 1251 1250 1256 1249 - 1020 1254 1251 1253 1256 - 1012 1253 1256 1244 1255 - 258 374 376 513 514 - 1016 1252 1257 1254 1251 - 1018 1258 1259 1257 1248 - 1008 1260 1258 1252 1257 - 900 1139 2603 1138 2605 - 2432 3082 3066 3081 3065 - 2447 3059 3061 3066 3060 - 2446 3080 3059 3082 3066 - 265 375 372 374 376 - 964 1276 1275 1264 1201 - 1007 1235 1264 1236 1267 - 1007 1274 1276 1235 1264 - 264 372 371 376 369 - 1006 1273 1274 1241 1235 - 992 1277 1273 1240 1241 - 1004 1240 1241 1239 1237 - 3198 4021 4040 4037 4033 - 996 1238 1242 1260 1258 - 1001 1237 1236 1242 1234 - 1000 1239 1237 1238 1242 - 902 1146 1139 1144 1138 - 2441 3060 3063 3064 3062 - 2443 3058 3191 3063 3192 - 2440 3061 3058 3060 3063 - 3592 4804 4522 4803 4525 - 963 1200 1211 1230 1229 - 961 1202 1203 1200 1211 - 960 1203 1209 1211 1210 - 3595 4527 4526 4522 4529 - 970 1272 2597 1204 2594 - 969 1271 1272 1208 1204 - 971 1208 1204 1209 1206 - 3594 4801 4527 4804 4522 - 967 1201 1207 1202 1203 - 968 1270 1271 1207 1208 - 966 1275 1270 1201 1207 - 1427 1773 1789 1772 1788 - 143 195 252 196 251 - 188 254 247 252 239 - 142 221 254 195 252 - 1795 2275 2274 2246 2245 - 1718 2120 2537 2519 2535 - 1719 2205 2204 2120 2537 - 1761 2204 2527 2537 2536 - 599 820 776 819 775 - 1760 2167 2547 2527 2546 - 1838 2526 5203 2547 2250 - 1766 2165 2526 2167 2547 - 1794 2230 2246 2242 2244 - 1763 2173 2166 2205 2204 - 1767 2170 2165 2166 2167 - 1762 2172 2170 2173 2166 - 245 322 319 342 337 - 733 1316 923 1338 1310 - 735 924 928 923 927 - 732 1315 924 1316 923 - 3720 4683 4687 4672 4686 - 3716 4666 4667 4665 4671 - 3713 4664 4689 4666 4667 - 3712 
4689 4688 4667 4683 - 3244 4079 4166 4174 4173 - 3730 4722 4721 4716 4674 - 3760 4703 4722 4717 4716 - 3714 4717 4716 4664 4689 - 1795 2224 2275 2230 2246 - 3731 4674 4677 4688 4681 - 3729 4720 4719 4677 4676 - 3728 4721 4720 4674 4677 - 244 319 310 337 338 - 1960 2391 2495 2488 2494 - 1984 5265 5264 2495 5188 - 1964 2390 5265 2391 2495 - 3154 4040 4048 4033 4047 - 3591 4802 4520 4801 4527 - 3588 4663 4662 4520 4521 - 3590 4805 4663 4802 4520 - 3232 4084 4174 4077 4172 - 3752 4792 4805 4700 4802 - 3746 4785 4792 4690 4700 - 3744 4690 4700 4697 4699 - 3246 4081 4079 4084 4174 - 3756 4696 4695 4719 4718 - 3755 4699 4801 4695 4804 - 3754 4697 4699 4696 4695 - 246 320 311 319 310 - 3518 4371 4466 5264 4464 - 3507 4375 4458 4466 4465 - 3517 4373 4375 4371 4466 - 3300 4134 4498 4500 4512 - 3764 4709 4702 4703 4722 - 3766 4711 4708 4709 4702 - 3767 4708 4712 4702 4715 - 3399 4500 4512 4499 4261 - 3749 4713 4692 4712 4698 - 3747 4693 4785 4692 4690 - 3748 4691 4693 4713 4692 - 3396 4498 4497 4512 4268 - 3774 4705 4714 4711 4708 - 3751 4788 4691 4714 4713 - 3775 4790 4788 4705 4714 - 1900 2342 2339 2312 2332 - 3127 3981 3977 3957 3956 - 3099 3931 14 3977 3976 - 3098 3933 3931 3981 3977 - 1363 1930 1929 1706 1698 - 3470 4383 4338 4387 4337 - 3469 4334 4333 4338 4336 - 3471 4477 4334 4383 4338 - 1928 2362 2360 2364 2384 - 3467 4482 4488 4333 4332 - 3371 4248 4472 4488 4487 - 3371 4224 4248 4482 4488 - 1930 2416 2415 2360 2382 - 3468 4471 4483 4477 4334 - 3368 4320 4224 4483 4482 - 3417 4275 4320 4471 4483 - 252 317 320 322 319 - 724 929 925 928 921 - 726 932 933 925 919 - 729 931 932 929 925 - 611 790 819 816 822 - 1370 1747 1704 1753 1701 - 1374 1746 1702 1747 1704 - 1373 1702 1941 1704 1703 - 2436 3057 3056 5178 5180 - 1375 1933 1943 1746 1702 - 1179 1491 1493 1943 1942 - 1177 1535 1491 1933 1943 - 1931 2363 2416 2362 2360 - 1180 1942 1492 1941 1930 - 1183 1494 1624 1492 1619 - 1182 1493 1494 1942 1492 - 3903 4859 4856 4858 4857 - 583 773 763 771 765 - 581 786 2590 763 2591 
- 580 788 786 773 763 - 2439 3062 3193 3056 3194 - 1176 1536 1496 1535 1491 - 1168 1488 1490 1496 1495 - 1170 1533 1488 1536 1496 - 1372 1941 1930 1703 1706 - 1161 1498 1620 1503 1623 - 1173 1499 1498 1500 1503 - 1172 1500 1503 1490 1489 - 2438 3064 3062 3057 3056 - 1171 1502 1501 1533 1488 - 1152 1478 1499 1501 1500 - 1154 1479 1478 1502 1501 - 3901 4869 4868 4859 4856 - 1967 5194 5266 2390 5265 - 3519 4372 4373 5266 4371 - 3514 5274 4372 5194 5266 - 766 952 954 951 957 - 1098 1415 1411 1414 1413 - 1093 1625 1414 1624 1407 - 1092 1622 1415 1625 1414 - 756 954 948 957 956 - 1103 1425 1426 1411 1410 - 1112 1423 1424 1426 1428 - 1108 1429 1423 1425 1426 - 767 951 957 1303 1302 - 1099 1623 1409 1622 1415 - 1097 1430 1429 1409 1425 - 1096 1620 1430 1623 1409 - 1882 2295 2293 2342 2339 - 3123 3979 3982 3970 3958 - 3103 3932 3933 3982 3981 - 3103 3943 3932 3979 3982 - 1368 1703 1706 1708 1705 - 3373 4227 4225 4318 4226 - 3416 4319 4318 4275 4320 - 3372 4325 4227 4319 4318 - 1335 1667 1746 1665 1747 - 3364 4221 4244 4247 4246 - 3366 4222 4221 4228 4247 - 3375 4228 4247 4225 4245 - 1332 1673 1665 1672 1671 - 3363 4326 4229 4325 4227 - 3362 4223 4222 4229 4228 - 3363 4256 4223 4326 4229 - 3896 4872 4859 4866 4858 - 590 769 767 768 772 - 588 789 788 767 773 - 591 787 789 769 767 - 3156 4037 4033 3993 3994 - 1119 1422 1420 1424 1421 - 1119 1471 1467 1422 1420 - 1118 1467 1466 1420 1473 - 199 261 259 340 345 - 1052 1365 1399 1405 1404 - 1051 1364 1365 1362 1405 - 1051 1362 1405 1466 1403 - 1334 1666 1667 1673 1665 - 1048 1369 1363 1471 1467 - 1055 1370 1364 1363 1362 - 1054 1368 1370 1369 1363 - 1765 5192 5204 2526 5203 - 731 1314 922 1315 924 - 728 930 931 922 929 - 730 1323 930 1314 922 - 610 798 820 790 819 - 1085 1390 1393 1389 1392 - 1087 5248 5247 1393 5186 - 1087 5246 5248 1390 1393 - 198 260 268 261 259 - 1086 5245 5246 1388 1390 - 1081 5244 5245 1402 1388 - 1080 1402 1388 1401 1387 - 196 268 267 259 266 - 1071 1400 1396 1399 1378 - 1083 1387 1389 1396 1395 - 1082 1401 
1387 1400 1396 - 3900 4867 4869 4872 4859 - 584 766 770 1298 1299 - 576 772 771 770 762 - 586 768 772 766 770 - 3701 4641 4640 4626 4625 - 1040 1372 1361 1371 1367 - 1036 1356 1371 1368 1370 - 1026 1347 1372 1356 1371 - 3703 4628 4626 4627 4629 - 1046 1358 1360 1361 1359 - 1045 5243 5244 1360 1402 - 1044 5242 5243 1358 1360 - 3700 4639 4641 4628 4626 - 1025 1349 1373 1347 1372 - 1047 5241 5242 1373 1358 - 1024 5240 5241 1349 1373 - 1883 2293 2303 2339 2341 - 3120 3969 3954 3968 3971 - 3122 3958 3957 3954 3965 - 3123 3970 3958 3969 3954 - 3822 4756 4755 4781 4773 - 3380 4231 4243 4472 4473 - 3388 4241 4242 4243 4236 - 3382 4232 4241 4231 4243 - 1072 1392 1385 1391 1384 - 3387 4240 4235 4242 4237 - 3385 4255 4254 4235 4234 - 3384 4253 4255 4240 4235 - 3818 4754 4753 4756 4755 - 3378 4246 4230 4232 4241 - 3379 4252 4253 4230 4240 - 3377 4244 4252 4246 4230 - 3322 5278 4170 5193 5192 - 3573 4438 4433 4437 4431 - 3568 4447 4450 4433 4432 - 3572 4451 4447 4438 4433 - 1074 1395 1391 1394 1398 - 1167 1599 1598 1486 1484 - 1251 1564 1566 1599 1598 - 1255 1566 1565 1598 1600 - 2255 2883 2882 2831 2830 - 1158 1481 1480 1479 1478 - 1157 1487 1486 1480 1485 - 1159 1604 1487 1481 1480 - 3823 4753 4757 4755 4771 - 1156 1606 1597 1604 1487 - 1250 1590 1564 1597 1599 - 1268 1578 1590 1606 1597 - 3323 4170 4143 5192 5204 - 3551 4413 4405 4451 4447 - 3550 4408 4409 4405 4407 - 3551 4414 4408 4413 4405 - 2287 2857 2878 2882 2885 - 1253 1571 1614 1565 1616 - 1033 1613 1355 1614 1351 - 1252 1574 1613 1571 1614 - 2286 2858 2857 2883 2882 - 1031 5236 5240 1350 1349 - 1031 5235 5236 1348 1350 - 1030 1348 1350 1355 1354 - 1084 1389 1392 1395 1391 - 1263 1568 1612 1574 1613 - 1028 5234 5235 1612 1348 - 1263 5233 5234 1568 1612 - 1764 5193 5192 2165 2526 - 3548 4407 4406 4450 4449 - 3547 4404 4403 4406 12 - 3546 4409 4404 4407 4406 - 2463 3112 3111 3072 3071 - 1248 1588 1575 1589 1573 - 1265 1576 1589 1578 1590 - 1264 1585 1588 1576 1589 - 2479 3087 3107 3111 3114 - 1257 1567 1570 1575 1569 - 
1262 5232 5233 1570 1568 - 1258 5231 5232 1567 1570 - 2478 3088 3087 3112 3111 - 1275 1580 1587 1585 1588 - 1259 5230 5231 1587 1567 - 1274 5229 5230 1580 1587 - 3786 4726 4730 4746 4745 - 923 1153 1157 1275 1270 - 916 1159 1151 1157 1156 - 922 1155 1159 1153 1157 - 3780 4730 4747 4745 4748 - 3220 4164 4178 4068 4177 - 3291 4123 4125 4178 4128 - 3291 4165 4123 4164 4178 - 1100 1410 1427 1412 1460 - 3293 4126 4124 4125 4129 - 3295 5118 4181 4124 4191 - 3294 5144 5118 4126 4124 - 1091 1413 1412 1406 1454 - 3290 5123 5122 4165 4123 - 3288 5145 5144 5122 4126 - 4027 4994 5145 5123 5122 - 1706 2130 2525 2112 2539 - 912 1161 2604 1149 2600 - 914 1162 1161 1158 1149 - 915 1158 1149 1151 1152 - 1102 1411 1410 1413 1412 - 1431 1896 1774 1898 1770 - 1523 1866 1898 1865 1897 - 1523 1876 1896 1866 1898 - 1160 1485 1482 1499 1498 - 1424 1779 1780 1774 1773 - 1432 1777 1775 1780 1781 - 1438 1776 1777 1779 1780 - 475 671 605 670 654 - 1531 1871 1895 1876 1896 - 1439 1894 1776 1895 1779 - 1529 1873 1894 1871 1895 - 1704 2112 2539 2532 2538 - 3353 4496 4495 4503 4214 - 3166 4002 4496 4509 4503 - 3352 4509 4503 4324 4321 - 1163 1484 1483 1482 1621 - 1435 1786 1782 1775 1783 - 1412 1822 1821 1782 1758 - 1413 1817 1822 1786 1782 - 473 608 607 605 612 - 1455 1799 1794 1821 1820 - 1449 1797 1798 1794 1793 - 1454 1796 1797 1799 1794 - 1166 1486 1484 1485 1482 - 1467 1803 1813 1817 1822 - 1444 1812 1796 1813 1799 - 1466 1805 1812 1803 1813 - 3082 3920 3919 3923 3922 - 3405 4268 4265 4267 4270 - 3407 4508 4324 4265 4266 - 3404 4497 4508 4268 4265 - 1289 1632 1652 1634 1650 - 1461 1892 1819 1893 1816 - 1528 1887 1893 1873 1894 - 1489 1899 1892 1887 1893 - 1291 1630 1647 1652 1651 - 1462 1807 1810 1819 1818 - 1465 1806 1805 1810 1803 - 1464 1808 1806 1807 1810 - 1288 1633 1630 1632 1652 - 1460 1900 1802 1899 1892 - 1463 1801 1808 1802 1807 - 1456 1901 1801 1900 1802 - 2453 3069 3068 5174 5175 - 4012 4977 4981 4992 4980 - 4011 4978 4976 4981 4979 - 4015 4998 4978 4977 4981 - 3282 4128 4127 
4119 4122 - 3211 5130 4072 4055 4071 - 3218 5136 5135 4072 4062 - 3210 5140 5136 5130 4072 - 375 497 486 496 495 - 4024 4990 5121 5135 5120 - 4027 4986 4994 5121 5123 - 4026 4988 4986 4990 5121 - 3287 4129 4190 4127 4120 - 4017 5074 4991 5140 5136 - 4019 4982 4988 4991 4990 - 4016 5067 4982 5074 4991 - 1805 2525 2524 2539 2229 - 1677 2584 2090 2086 2089 - 1673 2087 2091 2090 2088 - 1676 2555 2087 2584 2090 - 1606 2073 2522 2009 2523 - 362 519 518 481 474 - 359 471 481 473 480 - 359 517 519 471 481 - 3292 4125 4129 4128 4127 - 325 434 507 469 506 - 356 502 473 507 472 - 324 435 502 434 507 - 56 362 98 361 182 - 323 433 503 435 502 - 358 516 517 503 471 - 323 515 516 433 503 - 91 134 138 132 136 - 1651 2046 2067 3 2066 - 1631 2065 2020 2067 2019 - 1650 2048 2065 2046 2067 - 58 99 102 98 96 - 339 2 450 1 449 - 338 454 459 450 458 - 339 453 454 2 450 - 336 458 464 448 463 - 342 460 462 459 461 - 345 468 469 462 455 - 344 467 468 460 462 - 61 355 99 362 98 - 341 451 452 453 454 - 343 466 467 452 460 - 340 465 466 451 452 - 89 132 136 689 690 - 3139 4188 4510 4497 4508 - 3138 3983 4002 4510 4509 - 3139 3985 3983 4188 4510 - 3393 4311 4282 4313 4285 - 335 442 445 440 447 - 334 439 440 465 466 - 335 441 442 439 440 - 3439 4309 4313 4307 4312 - 320 446 444 445 436 - 322 514 515 444 433 - 329 513 514 446 444 - 3445 4315 4311 4309 4313 - 331 437 438 441 442 - 328 512 513 438 446 - 330 511 512 437 438 - 2455 3074 3081 3068 3079 - 471 603 646 604 649 - 490 651 642 646 622 - 470 617 651 603 646 - 1745 2150 2156 2199 2198 - 3784 5110 4728 5109 5094 - 3787 4729 4732 4728 4726 - 3789 5111 4729 5110 4728 - 88 136 135 690 691 - 3782 4733 4724 4732 4731 - 3779 5163 5172 4724 4723 - 3778 5162 5163 4733 4724 - 1695 2581 2199 2108 2095 - 3788 5112 4727 5111 4729 - 3791 5161 5162 4727 4733 - 3790 5160 5161 5112 4727 - 91 359 134 348 132 - 2500 3121 3120 5184 5182 - 2503 3129 6 3120 3119 - 2502 3130 3129 3121 3120 - 474 669 608 671 605 - 314 416 669 417 671 - 319 415 417 518 510 - 318 419 
416 415 417 - 1746 2560 2150 2581 2199 - 479 668 606 669 608 - 478 616 615 606 613 - 479 667 616 668 606 - 1520 1870 1868 1869 1864 - 315 418 414 419 416 - 313 422 667 414 668 - 312 424 422 418 414 - 3606 4532 4538 4531 4539 - 1674 2105 2107 2091 2106 - 1692 2095 2196 2107 2096 - 1694 2108 2095 2105 2107 - 1526 1867 1877 1870 1868 - 462 601 598 615 614 - 449 593 592 598 590 - 448 599 593 601 598 - 1609 2072 2073 2011 2009 - 455 591 648 592 647 - 502 693 694 648 631 - 455 692 693 591 648 - 1522 1877 1876 1868 1866 - 452 600 597 599 593 - 454 691 692 597 591 - 457 690 691 600 597 - 90 348 132 688 689 - 1672 2554 2586 2555 2087 - 1675 2580 2108 2586 2105 - 1122 1431 2580 2554 2586 - 1419 1763 1766 1765 1760 - 292 400 665 399 664 - 295 405 399 424 422 - 294 407 400 405 399 - 1418 1766 2565 1760 2567 - 459 666 594 665 596 - 456 689 690 594 600 - 458 688 689 666 594 - 1416 1765 1760 1764 1759 - 290 408 398 407 400 - 291 687 688 398 666 - 289 686 687 408 398 - 2452 3068 3079 5175 5176 - 464 612 610 653 656 - 466 611 604 610 602 - 472 607 611 612 610 - 2512 3183 3180 3135 3136 - 3884 4870 4873 4846 4845 - 3871 4826 4867 4873 4872 - 3870 4831 4826 4870 4873 - 349 459 461 458 464 - 3866 4827 4839 4867 4869 - 3849 4834 5109 4839 4838 - 3865 4835 4834 4827 4839 - 2543 3153 3177 3180 3185 - 3869 4830 4828 4831 4826 - 3864 4833 4835 4828 4827 - 3868 4829 4833 4830 4828 - 3607 4808 4532 4807 4531 - 1560 1968 2002 1959 2001 - 1587 1998 1997 2002 1982 - 1542 1945 1998 1968 2002 - 3516 4376 4374 4373 4375 - 270 395 394 377 370 - 268 373 377 375 372 - 261 396 395 373 377 - 2542 3154 3153 3183 3180 - 257 363 364 511 512 - 259 368 375 364 374 - 256 367 368 363 364 - 3498 4362 4361 5276 4382 - 263 365 366 367 368 - 260 397 396 366 373 - 262 393 397 365 366 - 3602 4530 4537 4532 4538 - 1567 1963 1964 2075 1960 - 1562 1965 1959 1964 1962 - 1554 1955 1965 1963 1964 - 3510 4368 4370 4374 4369 - 275 378 429 394 428 - 300 426 403 429 402 - 274 380 426 378 429 - 348 461 457 464 456 - 299 401 
409 403 404 - 288 685 686 409 408 - 298 684 685 401 409 - 3511 4378 4368 4376 4374 - 279 379 427 380 426 - 296 683 684 427 401 - 279 682 683 379 427 - 3603 4810 4530 4808 4532 - 1552 1966 1969 1955 1965 - 1541 1952 1945 1969 1968 - 1549 1953 1952 1966 1969 - 2303 2869 3214 2881 3215 - 287 387 390 385 392 - 285 384 385 393 397 - 284 386 387 384 385 - 2302 2870 2869 2886 2881 - 276 391 388 390 381 - 278 681 682 388 379 - 281 680 681 391 388 - 2271 2886 2881 2843 2842 - 283 382 383 386 387 - 280 679 680 383 391 - 282 678 679 382 383 - 3077 3922 3942 3916 3943 - 4004 4995 4973 4994 5145 - 4007 4980 4974 4973 4975 - 4006 4992 4980 4995 4973 - 1059 1403 1383 1477 1476 - 3679 4600 4598 4662 4661 - 3677 4602 4643 4598 4642 - 3676 4599 4602 4600 4598 - 3219 4062 4063 4061 4067 - 3666 4595 4652 4643 4650 - 3665 4604 4647 4652 4651 - 3664 4606 4604 4595 4652 - 1116 1473 1477 1458 1465 - 3672 4603 4601 4599 4602 - 3667 4597 4606 4601 4595 - 3670 4596 4597 4603 4601 - 3423 4277 4319 4278 4275 - 1440 1907 1906 1910 1909 - 1403 1727 1736 1906 1729 - 1402 1730 1727 1907 1906 - 1579 1993 1992 1976 1975 - 1558 2553 2552 2577 1956 - 1448 1795 2577 1798 2576 - 1442 1908 2553 1795 2577 - 1050 1466 1403 1473 1477 - 1557 2551 1967 2552 1958 - 1548 1948 1953 1967 1966 - 1550 2558 1948 2551 1967 - 1543 1944 1946 1945 1998 - 1407 1729 1728 1908 2553 - 1404 1732 2558 1728 2551 - 1406 1736 1732 1729 1728 - 3423 4274 4325 4277 4319 - 1624 2063 2017 2020 2024 - 1626 2028 2027 2017 2025 - 1627 2068 2028 2063 2017 - 1538 2222 2221 1946 1996 - 1540 1954 1950 1953 1952 - 1536 1951 2213 1950 1944 - 1546 1947 1951 1954 1950 - 1577 1979 1976 1978 1974 - 1671 2214 2083 2213 2222 - 1670 2085 2102 2083 2101 - 1671 2084 2085 2214 2083 - 1576 1994 1993 1979 1976 - 1547 2211 2212 1947 1951 - 1664 2089 2084 2212 2214 - 1678 2086 2089 2211 2212 - 3422 4284 4274 4276 4277 - 1468 1814 1811 1805 1812 - 1447 1905 1910 1811 1791 - 1446 1913 1905 1814 1811 - 3623 4583 4553 4547 4546 - 1380 1715 2557 1714 2559 - 
1382 1737 1714 1736 1732 - 1383 1721 1715 1737 1714 - 3622 4576 4583 4575 4547 - 1379 1713 2570 2557 2569 - 1679 2583 2086 2570 2211 - 1379 1928 2583 1713 2570 - 3616 4575 4547 4577 4545 - 1376 1722 1720 1721 1715 - 1378 1931 1928 1720 1713 - 1386 1716 1931 1722 1720 - 601 780 2589 786 2590 - 476 613 609 607 611 - 469 614 617 609 603 - 468 615 614 613 609 - 1535 1890 1888 1872 1874 - 417 551 561 579 580 - 431 562 558 561 556 - 416 559 562 551 561 - 1488 1835 1899 1888 1887 - 428 557 582 558 587 - 410 657 658 582 539 - 425 655 657 557 582 - 1539 2213 2222 1944 1946 - 427 560 555 559 562 - 424 656 655 555 557 - 426 653 656 560 555 - 66 118 125 116 124 - 3638 4557 4558 5137 5153 - 3639 4567 4566 4558 4565 - 3635 4556 4567 4557 4558 - 3370 4226 4249 4224 4248 - 1607 2005 2529 2004 2528 - 1601 2031 2004 2030 2003 - 1600 2010 2005 2031 2004 - 327 443 434 468 469 - 1623 2015 2016 2012 26 - 1621 2014 2030 2016 2033 - 1619 2023 2014 2015 2016 - 1490 1837 1835 1890 1888 - 1618 2025 2032 2023 2014 - 1605 2026 2010 2032 2031 - 1616 2027 2026 2025 2032 - 1868 2284 2502 2338 2503 - 1392 1739 1723 1913 1905 - 1395 1725 1730 1723 1907 - 1394 1742 1725 1739 1723 - 3433 4294 4290 4293 4486 - 1896 2523 2343 2529 2310 - 1887 2294 2295 2343 2342 - 1887 2522 2294 2523 2343 - 3383 4245 4232 4249 4231 - 1881 2296 2299 2295 2293 - 1875 2297 2302 2299 2304 - 1880 2298 2297 2296 2299 - 3374 4225 4245 4226 4249 - 1886 2521 2545 2522 2294 - 1884 2540 2298 2545 2296 - 1580 1973 2540 2521 2545 - 3412 4276 4277 4271 4278 - 624 985 804 1309 1308 - 626 805 810 804 802 - 629 979 805 985 804 - 1738 2147 2148 2146 2144 - 1608 2071 2072 2006 2011 - 1611 2008 2006 2027 2026 - 1610 2080 2071 2008 2006 - 1740 2148 2143 2144 2145 - 1570 1977 1980 2072 2073 - 1583 1974 1973 1980 2521 - 1582 1978 1974 1977 1980 - 1739 2146 2144 2163 2162 - 1569 2081 1970 2080 2071 - 1571 1972 1978 1970 1977 - 1568 1971 1972 2081 1970 - 604 781 2588 780 2589 - 1388 1939 1938 1717 1718 - 1139 1443 1442 1938 1937 - 1138 1456 
1443 1939 1938 - 2520 3141 3147 3137 3145 - 3155 4047 4044 3992 4043 - 3113 3974 3975 4044 3950 - 3112 4048 3974 4047 4044 - 2523 3139 3137 5155 5156 - 3079 3918 3980 3975 3978 - 3078 3916 3943 3980 3979 - 3079 3917 3916 3918 3980 - 3426 4293 4486 4479 4484 - 3072 4049 3915 4048 3974 - 3075 3924 3917 3915 3918 - 3074 4050 3924 4049 3915 - 1918 2344 2338 2325 2340 - 3634 4572 4574 4556 4567 - 3642 4571 4570 4574 4559 - 3619 4544 4571 4572 4574 - 1952 5190 5276 2387 5275 - 1850 2277 2262 2271 2273 - 1811 2245 2271 2232 2233 - 1810 2274 2277 2245 2271 - 1667 2088 2104 2084 2085 - 1851 2259 2261 2262 2264 - 1855 2260 5189 2261 2265 - 1849 2267 2260 2259 2261 - 2522 3142 3141 3139 3137 - 1824 2247 2269 2274 2277 - 1848 2266 2267 2269 2259 - 1827 2248 2266 2247 2269 - 989 1263 1219 1262 1218 - 3563 5249 4422 5191 5273 - 3561 4425 4424 4422 4421 - 3560 4423 4425 5249 4422 - 1604 2011 2009 2010 2005 - 1857 2301 2300 2302 2282 - 1856 2347 2346 2300 2354 - 1876 2348 2347 2301 2300 - 1690 2106 2100 2104 2103 - 1819 2243 2241 2346 2234 - 1809 2244 2232 2241 2240 - 1808 2242 2244 2243 2241 - 1666 2091 2106 2088 2104 - 1800 2349 2226 2348 2347 - 1803 2227 2242 2226 2243 - 1802 2228 2227 2349 2226 - 1871 2286 2284 2344 2338 - 3643 4559 4564 4566 4562 - 3641 4569 4568 4564 4563 - 3640 4570 4569 4559 4564 - 4076 5039 5058 5041 5057 - 1793 2225 2224 2231 2230 - 1804 2229 2231 2228 2227 - 1797 2524 2225 2229 2231 - 4083 5058 5046 5057 5056 - 1792 2270 2276 2224 2275 - 1826 2249 2248 2276 2247 - 1830 2278 2249 2270 2276 - 4071 5041 5057 5036 5034 - 1796 2520 2223 2524 2225 - 1799 2535 2278 2223 2270 - 1798 2519 2535 2520 2223 - 2454 3076 3074 3069 3068 - 1360 1698 1940 1700 1745 - 1137 1927 1456 1940 1939 - 1362 1929 1927 1698 1940 - 3203 4058 4057 4105 4104 - 3860 4836 4832 4829 4833 - 3843 4815 4820 4832 4822 - 3847 4813 4815 4836 4832 - 3432 4290 4289 4486 4485 - 3855 4819 4821 4820 4817 - 3852 5159 5160 4821 5112 - 3842 5158 5159 4819 4821 - 3256 4106 4105 4088 4091 - 3846 4816 
4814 4813 4815 - 3840 5157 5158 4814 4819 - 3841 5166 5157 4816 4814 - 991 1265 1220 1263 1219 - 3903 4856 5097 4857 5100 - 3802 5096 4739 5097 4740 - 3902 4868 5096 4856 5097 - 3203 4053 4058 4106 4105 - 1594 2217 2216 1985 1984 - 1593 1988 1985 1994 1993 - 1595 1989 2217 1988 1985 - 1818 2234 2236 2353 2352 - 1574 1991 1990 1971 1972 - 1575 1995 1994 1990 1979 - 1590 1981 1995 1991 1990 - 1035 1351 1357 1352 1353 - 1591 1982 1986 1981 1995 - 1592 1987 1989 1986 1988 - 1587 1997 1987 1982 1986 - 1023 1249 1263 13 1262 - 3867 4838 5095 4868 5096 - 3801 5094 4746 5095 4737 - 3848 5109 5094 4838 5095 - 1032 1355 1354 1351 1357 - 1711 2113 2115 2216 2220 - 1707 2114 2130 2115 2112 - 1709 2117 2114 2113 2115 - 1027 1354 1347 1357 1356 - 1714 2131 2119 2130 2525 - 1715 2121 2519 2119 2520 - 1713 2127 2121 2131 2119 - 4063 5023 5071 5065 5070 - 1708 2116 2129 2117 2114 - 1712 2134 2127 2129 2131 - 1697 2132 2134 2116 2129 - 1022 1250 1265 1249 1263 - 3803 4737 4741 4739 4743 - 3796 4745 4748 4741 4736 - 3800 4746 4745 4737 4741 - 461 665 596 664 595 - 1700 2219 2110 2210 2215 - 1586 1996 2210 1997 1987 - 1584 2221 2219 1996 2210 - 463 596 599 595 601 - 1703 2137 2111 2110 2118 - 1696 2109 2132 2111 2116 - 1698 2136 2109 2137 2111 - 460 664 595 667 616 - 1668 2101 2138 2221 2219 - 1687 2093 2136 2138 2137 - 1686 2102 2093 2101 2138 - 3076 3923 3922 3917 3916 - 4002 4979 5106 4974 4972 - 3719 5093 4665 5106 5105 - 4010 4976 5093 4979 5106 - 224 302 308 326 328 - 3671 4605 4608 4596 4597 - 3651 4586 4592 4608 4607 - 3652 4584 4586 4605 4608 - 1949 2372 2374 2371 2373 - 3656 4593 4594 4592 4590 - 3658 5156 5183 4594 4588 - 3650 5155 5156 4593 4594 - 233 306 301 308 297 - 3655 4587 4585 4584 4586 - 3648 5154 5155 4585 4593 - 3649 5179 5154 4587 4585 - 1879 2533 2292 2291 2290 - 3343 4206 4205 4202 4204 - 3331 4196 16 4205 4193 - 3333 4197 4196 4206 4205 - 232 303 306 302 308 - 172 256 229 255 228 - 147 203 255 221 254 - 146 205 256 203 255 - 4028 4987 4985 4988 4986 - 170 227 
231 229 232 - 169 345 344 231 230 - 168 340 345 227 231 - 3496 5252 4362 5190 5276 - 151 204 257 205 256 - 171 341 340 257 227 - 151 338 341 204 257 - 1874 2292 2348 2290 2301 - 3376 4257 4251 4244 4252 - 3339 4198 4202 4251 4201 - 3338 4219 4198 4257 4251 - 1859 2354 2353 2281 2289 - 157 209 210 218 220 - 159 212 215 210 217 - 156 211 212 209 210 - 4031 4989 4993 4987 4985 - 148 216 213 215 206 - 150 337 338 213 204 - 153 342 337 216 213 - 4030 4993 4992 4985 4995 - 155 207 208 211 212 - 152 343 342 208 216 - 154 339 343 207 208 - 1875 2290 2301 2297 2302 - 1820 2352 2235 2351 2501 - 1823 2237 2500 2235 2511 - 1822 2236 2237 2352 2235 - 1936 2412 2499 2367 2498 - 131 193 198 189 197 - 130 188 189 678 679 - 131 192 193 188 189 - 1974 2396 2487 2412 2499 - 134 199 202 198 201 - 141 222 221 202 195 - 140 219 222 199 202 - 1996 2487 2490 2499 2442 - 133 190 191 192 193 - 135 220 219 191 199 - 132 218 220 190 191 - 600 778 780 788 786 - 1095 1619 1408 1929 1927 - 1088 1407 1406 1408 1455 - 1094 1624 1407 1619 1408 - 3329 4506 4511 4197 4196 - 3100 3937 3936 3938 3934 - 3102 3942 3938 3943 3932 - 3089 3941 3937 3942 3938 - 3328 4494 4493 4511 4194 - 3095 5147 5146 3929 3928 - 3095 5152 5147 3930 3929 - 3094 3930 3929 3936 3935 - 1819 2346 2234 2354 2353 - 3088 5133 3926 3941 3937 - 3091 5153 5152 3926 3930 - 3090 5137 5153 5133 3926 - 3343 4202 4204 4201 4200 - 1655 2062 2064 2048 2065 - 1635 2034 2068 2064 2063 - 1634 2060 2034 2062 2064 - 1948 2381 2372 2380 2371 - 3424 5214 4287 5189 5277 - 3430 4286 4288 5214 4287 - 3427 4288 4294 4287 4293 - 3130 3959 4494 4506 4511 - 3438 4308 4307 4304 4292 - 3451 4302 4308 4305 4304 - 3431 4305 4304 4286 4288 - 1251 1573 1572 1564 1566 - 3437 4292 4291 4294 4290 - 3436 4312 4310 4291 4317 - 3439 4307 4312 4292 4291 - 1872 2291 2290 2298 2297 - 2181 2785 2762 5170 5164 - 2183 2771 2770 2762 2764 - 2180 2786 2771 2785 2762 - 137 198 201 197 200 - 3409 4283 4284 4273 4276 - 3411 4285 4273 4310 4272 - 3408 4282 4283 4285 4273 - 1256 
1575 1569 1573 1572 - 3361 4322 4256 4327 4326 - 3402 4264 4322 4281 4327 - 3420 4281 4327 4284 4274 - 1260 1569 1574 1572 1571 - 3395 4262 4269 4282 4283 - 3403 4270 4264 4269 4281 - 3394 4267 4270 4262 4269 - 3341 4200 4199 4255 4254 - 3336 4220 4207 4219 4198 - 3332 4195 4197 4207 4206 - 3334 4218 4195 4220 4207 - 519 706 699 724 722 - 3447 4303 4296 4302 4308 - 3446 4301 4297 4303 4296 - 3444 4297 4315 4296 4309 - 518 707 700 706 699 - 3398 4261 4267 4263 4262 - 3399 4499 4261 4316 4263 - 3392 4316 4263 4315 4311 - 516 700 2612 699 2611 - 3443 4514 4295 4301 4297 - 3440 4492 4499 4295 4316 - 3442 4491 4492 4514 4295 - 3718 5092 4666 5093 4665 - 1136 1455 1464 1456 1443 - 1143 1454 1461 1464 1441 - 1090 1406 1454 1455 1464 - 1888 2317 2335 2306 2334 - 364 478 475 500 498 - 366 479 476 478 475 - 367 476 675 475 677 - 136 201 196 200 194 - 423 654 653 552 560 - 422 670 654 663 552 - 420 663 552 675 554 - 1891 2307 2306 8 2305 - 363 474 477 479 476 - 361 510 670 477 663 - 360 518 510 474 477 - 64 358 116 357 143 - 2184 2787 2765 2786 2771 - 2186 2768 2767 2765 2772 - 2187 2788 2768 2787 2765 - 1890 2314 2317 2307 2306 - 3360 4250 4260 4256 4223 - 3359 4213 4212 4250 4260 - 3358 4212 4215 4260 4259 - 1503 1920 1839 1840 1841 - 3348 4210 4218 4211 4220 - 3346 4208 4210 4216 4211 - 3350 4216 4211 4215 4209 - 128 197 200 680 681 - 3355 4214 4217 4213 4212 - 3347 4516 4208 4217 4216 - 3354 4495 4516 4214 4217 - 3342 4204 4203 4200 4199 - 2176 2772 2769 2770 2761 - 2188 2766 2893 2769 2888 - 2190 2767 2766 2772 2769 - 1501 1904 1903 1920 1839 - 3335 4490 4507 4218 4195 - 3128 3972 3959 4507 4506 - 3109 3946 3972 4490 4507 - 1500 1903 1911 1839 1902 - 3131 3961 3963 3959 4494 - 3135 3960 15 3963 3962 - 3133 3964 3960 3961 3963 - 332 447 443 467 468 - 3108 3945 3973 3946 3972 - 3132 3971 3964 3973 3961 - 3110 3968 3971 3945 3973 - 3340 4201 4200 4253 4255 - 628 980 803 979 805 - 631 814 815 803 811 - 630 990 814 980 803 - 2204 2779 2778 5169 5171 - 3106 4042 3944 4518 4517 
- 3167 3997 4518 4495 4516 - 3163 3995 4042 3997 4518 - 2206 2781 2783 2779 2778 - 3107 3952 3951 3944 3947 - 3111 3953 3968 3951 3945 - 3114 3949 3953 3952 3951 - 2207 2783 2786 2778 2785 - 3162 4000 4041 3995 4042 - 3115 4043 3949 4041 3952 - 3153 3992 4043 4000 4041 - 602 783 781 778 780 - 3988 4959 4964 4966 4965 - 3986 4957 4959 5076 4966 - 3992 5076 4966 5075 4962 - 1555 1958 1955 1957 1963 - 374 495 484 494 483 - 372 485 672 484 674 - 375 486 485 495 484 - 1559 1956 1957 2074 2076 - 447 662 570 672 571 - 444 578 579 570 577 - 446 676 578 662 570 - 3270 4111 4110 4158 4157 - 371 492 482 486 485 - 368 498 676 482 662 - 370 500 498 492 482 - 1414 1821 1820 1758 1815 - 988 1219 1223 1218 1222 - 976 1225 1224 1223 1213 - 990 1220 1225 1219 1223 - 3888 4863 4852 5083 5085 - 3137 4180 3985 4189 4188 - 3303 4161 4189 4134 4498 - 3302 4155 4180 4161 4189 - 1063 1381 1379 1375 2573 - 3298 4139 4133 4491 4492 - 3299 4135 4134 4133 4500 - 3297 4138 4135 4139 4133 - 1556 2552 1958 1956 1957 - 3296 4159 4162 4138 4135 - 3276 4114 4155 4162 4161 - 3279 4118 4114 4159 4162 - 1408 1758 1815 1757 1756 - 986 1221 1217 1220 1225 - 984 1229 1228 1217 1226 - 987 1230 1229 1221 1217 - 3890 4864 4860 4863 4852 - 3136 3986 3984 3985 3983 - 3140 4005 4004 3984 4003 - 3142 3990 4005 3986 3984 - 3267 4109 4116 4110 4117 - 3160 3999 4001 4004 3996 - 3152 3994 3992 4001 4000 - 3158 3993 3994 3999 4001 - 3891 4860 4854 4852 4855 - 3149 3989 4006 3990 4005 - 3159 4007 3993 4006 3999 - 3148 3987 4007 3989 4006 - 1452 1820 1792 1815 2562 - 983 1226 1214 1224 1216 - 981 1227 2618 1214 2617 - 980 1228 1227 1226 1214 - 3097 3936 3935 3934 3940 - 3275 4113 4185 4115 4192 - 3278 4117 4115 4118 4114 - 3275 4116 4113 4117 4115 - 3096 3934 3940 3933 3931 - 3147 4186 3988 4185 3991 - 3151 4034 3987 3988 3989 - 3147 4039 4034 4186 3988 - 3092 3935 3927 3940 3939 - 3273 4132 4187 4116 4113 - 3146 4183 4039 4187 4186 - 3284 4121 4183 4132 4187 - 3080 3919 3941 3922 3942 - 4001 4975 5119 5144 5118 - 4003 
4972 5104 5119 5124 - 4000 4974 4972 4975 5119 - 3241 4102 4100 4096 4099 - 3696 4638 4639 4635 4628 - 3708 4632 4635 4636 4634 - 3706 4630 4638 4632 4635 - 3225 4066 4065 4100 4163 - 3690 4621 4620 4616 4615 - 3691 4618 4621 4619 4616 - 3688 4619 4616 4639 4641 - 1950 2379 2385 2381 2372 - 3707 4651 4637 4630 4638 - 3695 4617 4618 4637 4619 - 3694 4647 4617 4651 4637 - 1652 2049 2048 2047 2046 - 1058 1476 1374 1475 1470 - 1056 1382 1381 1374 1375 - 1059 1383 1382 1476 1374 - 3227 4073 4066 4102 4100 - 60 356 97 355 99 - 62 349 104 356 97 - 63 104 105 97 103 - 352 480 479 470 478 - 204 266 263 344 346 - 207 264 355 263 362 - 206 267 264 266 263 - 1699 2135 2139 2136 2109 - 203 262 265 267 264 - 201 288 349 265 356 - 200 286 288 262 265 - 1651 2047 2046 2045 3 - 243 309 329 311 330 - 192 335 289 329 258 - 242 327 335 309 329 - 84 138 130 136 135 - 38 91 87 105 106 - 39 88 79 87 78 - 44 89 88 91 87 - 354 473 480 472 470 - 35 77 111 79 110 - 2 109 44 111 45 - 35 76 109 77 111 - 355 472 470 499 501 - 32 90 85 89 88 - 34 86 76 85 77 - 41 80 86 90 85 - 1649 2057 2047 2056 2045 - 1066 1376 1380 1381 1379 - 1065 1398 1397 1380 2549 - 1064 1394 1398 1376 1380 - 3705 4650 4630 4633 4632 - 220 278 351 277 350 - 222 281 277 286 288 - 223 279 278 281 277 - 3704 4643 4650 4642 4633 - 43 352 82 351 84 - 40 81 80 82 90 - 42 347 81 352 82 - 3710 4642 4633 4644 4631 - 219 284 276 279 278 - 216 275 347 276 352 - 218 280 275 284 276 - 4009 4996 5098 5004 5092 - 4032 5090 5087 5008 5068 - 3912 4878 5078 5087 5086 - 3914 4882 4878 5090 5087 - 3266 4112 4109 4111 4110 - 1314 2593 1656 2601 1654 - 1313 1663 1662 1656 1655 - 1312 2592 1663 2593 1656 - 3274 4192 4179 4155 4180 - 1324 1658 1659 1662 1661 - 1327 1926 1925 1659 1675 - 1321 1924 1926 1658 1659 - 3143 3991 3990 4179 3986 - 1323 2591 1657 2592 1663 - 1320 1923 1924 1657 1658 - 1322 2590 1923 2591 1657 - 1648 2054 2049 2057 2047 - 1816 2240 2239 2236 2237 - 1815 2233 2272 2239 2238 - 1812 2232 2233 2240 2239 - 1620 2033 2544 2013 
2542 - 1683 2098 2092 2094 2135 - 1685 2103 2094 2102 2093 - 1684 2100 2098 2103 2094 - 3344 4501 4490 4210 4218 - 1681 2097 2197 2092 2140 - 1787 2202 2178 2197 2200 - 1680 2208 2202 2097 2197 - 3144 4185 3991 4192 4179 - 1691 2096 2099 2100 2098 - 1689 2209 2208 2099 2097 - 1688 2196 2209 2096 2099 - 1632 2042 2036 2061 2060 - 1068 1404 1377 1383 1382 - 1067 1378 1394 1377 1376 - 1070 1399 1378 1404 1377 - 1603 2003 2530 2544 2543 - 1790 2180 2179 2178 2177 - 1789 2183 2182 2179 2186 - 1791 2191 2183 2180 2179 - 1602 2030 2003 2033 2544 - 1780 2176 2174 2182 2188 - 1783 5196 5199 2174 5198 - 1782 2175 5196 2176 2174 - 1727 2140 2123 2139 2133 - 1788 2190 2181 2191 2183 - 1779 2194 2175 2181 2176 - 1778 2189 2194 2190 2181 - 1640 2038 2042 2059 2061 - 1338 1677 1674 1678 1668 - 1331 1664 1666 1674 1673 - 1331 1676 1664 1677 1674 - 3660 4590 4591 4589 4648 - 1751 2159 2158 2152 2151 - 1748 2198 2152 2196 2209 - 1744 2156 2159 2198 2152 - 3663 4607 4589 4606 4604 - 1736 2162 2189 2164 2190 - 1758 2163 2162 2154 2164 - 1759 2154 2164 2158 2192 - 3662 4592 4590 4607 4589 - 1753 2157 2155 2156 2159 - 1756 2161 2163 2155 2154 - 1752 2153 2161 2157 2155 - 3715 5098 4664 5092 4666 - 3985 5086 4960 4957 4959 - 3990 5079 5081 4960 4958 - 3984 5078 5079 5086 4960 - 692 888 893 894 897 - 1197 1514 1513 1510 1509 - 1196 2589 1510 2590 1923 - 1199 2588 1514 2589 1510 - 697 887 886 893 883 - 1191 1507 1505 1506 1525 - 1187 1607 1507 1504 1506 - 1184 1504 1506 1513 1511 - 467 602 650 655 657 - 1199 2587 1512 2588 1514 - 1186 1608 1607 1512 1504 - 1198 2608 1608 2587 1512 - 1642 2040 2044 2038 2042 - 1330 1675 1936 1676 1664 - 1214 1922 1520 1936 1935 - 1326 1925 1922 1675 1936 - 1682 2092 2140 2135 2139 - 3247 4093 4081 4083 4084 - 3239 4095 4083 5199 4078 - 3236 4098 4093 4095 4083 - 696 890 887 888 893 - 3243 4082 4080 4081 4079 - 3240 4099 4168 4080 4167 - 3243 4096 4099 4082 4080 - 1195 1509 1508 1924 1926 - 3248 4087 4094 4098 4093 - 3242 4097 4096 4094 4082 - 3254 4086 4097 
4087 4094 - 3461 4335 4356 4329 4357 - 1814 2238 2509 2500 2507 - 1840 2279 2504 2509 2508 - 1815 2272 2279 2238 2509 - 1194 1513 1511 1509 1508 - 3271 4163 4169 4168 4156 - 3268 4175 4112 4169 4111 - 3229 4065 4175 4163 4169 - 1192 1511 1526 1508 1527 - 3264 4130 4131 4112 4109 - 3281 4122 4121 4131 4132 - 3280 4119 4122 4130 4131 - 1951 2385 2375 2372 2374 - 3228 4064 4176 4065 4175 - 3283 4177 4119 4176 4130 - 3230 4068 4177 4064 4176 - 1636 2044 2043 2042 2036 - 1211 1518 1535 1934 1933 - 1210 1520 1518 1935 1934 - 1328 1935 1934 1666 1667 - 3674 4795 4599 4794 4600 - 3215 4074 4073 4103 4102 - 3255 4104 4103 4086 4097 - 3215 4057 4074 4104 4103 - 3675 4806 4794 4805 4663 - 3231 4067 4068 4069 4064 - 3217 4061 4067 4070 4069 - 3224 4070 4069 4073 4066 - 3834 4763 4795 4806 4794 - 3214 4056 4075 4057 4074 - 3216 4071 4061 4075 4070 - 3212 4055 4071 4056 4075 - 4008 5004 5092 4976 5093 - 4039 5009 5077 5028 5029 - 3987 5068 4957 5077 5076 - 4038 5008 5068 5009 5077 - 635 806 818 1296 1297 - 1215 1527 1524 1925 1922 - 1212 1522 1523 1524 1521 - 1206 1526 1522 1527 1524 - 37 79 110 78 108 - 1155 1531 1479 1530 1502 - 1205 1517 1530 1523 1534 - 1200 1516 1531 1517 1530 - 587 823 768 818 766 - 1207 1525 1528 1526 1522 - 1204 1529 1516 1528 1517 - 1190 1505 1529 1525 1528 - 3460 4336 4335 4328 4329 - 4091 5049 5050 5052 5047 - 4093 5069 5067 5050 5074 - 4092 5048 5069 5049 5050 - 1987 5188 5187 2424 2420 - 1755 2572 2160 2153 2161 - 1735 2141 2147 2160 2146 - 1734 2550 2141 2572 2160 - 0 110 47 108 48 - 1731 2142 2149 2147 2148 - 1728 5271 5270 2149 5200 - 1731 2556 5271 2142 2149 - 632 808 823 806 818 - 1732 2549 2568 2550 2141 - 1730 2548 2556 2568 2142 - 1078 1397 2548 2549 2568 - 3464 4332 4355 4335 4356 - 4084 5045 5054 5046 5053 - 4095 5051 5048 5054 5049 - 4087 5063 5051 5045 5054 - 36 78 108 107 112 - 3260 4108 4107 5270 4089 - 3204 4052 4053 4107 4106 - 3206 4101 4052 4108 4107 - 3528 5264 4464 5188 5187 - 3202 4054 4059 4053 4058 - 3208 4060 4055 4059 4056 - 
3201 5149 4060 4054 4059 - 3530 4464 4393 5187 5225 - 3207 5117 5116 4101 4052 - 3200 5150 5149 5116 4054 - 4069 5055 5150 5117 5116 - 3466 4333 4332 4336 4335 - 4082 5056 5044 5055 5150 - 4080 5053 5052 5044 5151 - 4083 5046 5053 5056 5044 - 523 756 703 755 701 - 1075 1384 1386 1397 2548 - 1079 5259 5258 1386 5202 - 1077 1385 5259 1384 1386 - 557 733 756 732 755 - 4064 5035 5115 5258 5114 - 4068 5034 5055 5115 5117 - 4070 5036 5034 5035 5115 - 556 732 755 751 758 - 1076 5186 5260 1385 5259 - 4067 5042 5036 5260 5035 - 4066 5247 5042 5186 5260 diff --git a/3rdParty/gslib.github/examples/simple_tests/gs_allreduce.c b/3rdParty/gslib.github/examples/simple_tests/gs_allreduce.c deleted file mode 100644 index c61d436bd..000000000 --- a/3rdParty/gslib.github/examples/simple_tests/gs_allreduce.c +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include -#include -#include -#include "../../src/c99.h" -#include "../../src/name.h" -#include "../../src/fail.h" -#include "../../src/types.h" -#include "../../src/mem.h" -#include "../../src/comm.h" -#include "../../src/gs_defs.h" -#include "../../src/gs.h" - -typedef double T; -const gs_dom dom = gs_double; - -int main(int narg, char *arg[]) -{ - comm_ext world; int np; - double t1,t2,gs_time,mpi_time; - struct gs_data *gsh; - struct comm comm; - double *localData,*recvBuf; - slong *glo_num; - int i,j,nid,nsamples; - T *v; - -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); - -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - MPI_Comm_rank(world,&nid); - - glo_num = malloc(sizeof(slong)*1); - glo_num[0] = 1; - - gsh = gs_setup(glo_num,1,&comm,0,gs_auto,1); - - nsamples = 500000; - localData = malloc(sizeof(int)*1); - recvBuf = malloc(sizeof(int)*1); - - MPI_Barrier(world); - t1 = MPI_Wtime(); - for(j=0;j -#include -#include -#include -#include -#include "../../src/c99.h" -#include "../../src/name.h" -#include "../../src/fail.h" -#include "../../src/types.h" -#include 
"../../src/comm.h" -#include "../../src/mem.h" -#include "../../src/gs_defs.h" -#include "../../src/gs.h" - -typedef double T; -const gs_dom dom = gs_double; - -int main(int narg, char *arg[]) -{ - comm_ext world; int np; - double t1,t2,gs_time,mpi_time; - struct gs_data *gsh; - struct comm comm; - int *localData,*recvBuf; - slong *glo_num; - int i,j,nid,nsamples; - T *v; - -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); - -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - MPI_Comm_rank(world,&nid); - - glo_num = malloc(sizeof(slong)*np); - - for(i=1;i<=np;i++){ - j = nid+1; - if(j>=i){ - glo_num[i-1] = (i-1)*np-i*(i-1)/2 + j-1; - } else { - glo_num[i-1] = (j-1)*np - j*(j-1)/2+i-1; - } - if(j==i){ - glo_num[i-1] = 0; - } - } - - gsh = gs_setup(glo_num,np,&comm,0,gs_auto,1); - - nsamples = 10000; - localData = malloc(sizeof(int)*np); - recvBuf = malloc(sizeof(int)*np); - - MPI_Barrier(world); - t1 = MPI_Wtime(); - for(j=0;j -#include -#include -#include -#include -#include "../../src/c99.h" -#include "../../src/name.h" -#include "../../src/fail.h" -#include "../../src/types.h" -#include "../../src/comm.h" -#include "../../src/mem.h" -#include "../../src/gs_defs.h" -#include "../../src/gs.h" - -typedef double T; -const gs_dom dom = gs_double; - -static void test(const struct comm *comm) -{ - struct gs_data *gsh; - const uint np = comm->np; - slong *id = tmalloc(slong,np+4); - T *v = tmalloc(T,np+4); - uint i; - id[0] = -(slong)(np+10+3*comm->id); - for(i=0;iid+1; - id[np+2] = comm->id+1; - id[np+3] = np-comm->id; - gsh = gs_setup(id,np+4,comm,0,gs_auto,1); - free(id); - - for(i=0;iid==0) for(i=0;iid==0) printf("\n"); - for(i=0;iid==0) for(i=0;i makefile.cdep; - -odepinfo: deps objects; @./odep_info.py *.o - --include makefile.cdep - -%.o: %.c ; @echo CC $<; $(CCCMD) -c $< -o $@ -%.s: %.c ; @echo CC -S $<; $(CCCMD) -S $< -o $@ -objects: $(OBJECTS) ; - -poly_imp.h: gen_poly_imp.c - $(RM) poly_imp.h; - $(CC) -lgmp -lm 
gen_poly_imp.c -o gen_poly_imp; - ./gen_poly_imp > poly_imp.h; - $(RM) gen_poly_imp - -GS_OBJECTS=gs.o sort.o sarray_transfer.o sarray_sort.o \ - gs_local.o fail.o crystal.o comm.o tensor.o - -XXT=sparse_cholesky.o xxt.o -AMG=amg.o - -sort_test: sort.o fail.o comm.o tensor.o gs_local.o sort_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -sort_test2: sort.o fail.o comm.o tensor.o gs_local.o sort_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -sarray_sort_test: sort.o fail.o comm.o tensor.o gs_local.o sarray_sort.o sarray_sort_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -spchol_test: sparse_cholesky.o sort.o fail.o comm.o tensor.o gs_local.o spchol_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -comm_test: fail.o comm.o tensor.o gs_local.o comm_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -crystal_test: fail.o crystal.o comm.o tensor.o gs_local.o crystal_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -sarray_transfer_test: sarray_transfer.o sarray_sort.o sort.o fail.o crystal.o comm.o tensor.o gs_local.o sarray_transfer_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ - -gs_test: gs_test.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -gs_test_old: gs_test_old.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -gslib_test: gslib_test.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -lm -o $@ -gs_alltoall: gs_alltoall.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -gs_unique_test: gs_unique_test.o $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -xxt_test: xxt_test.o $(CRS) $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -xxt_test2: xxt_test2.o $(CRS) $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ -crs_test: crs_test.o $(CRS) $(GS_OBJECTS); @echo LINK $@; $(LINKCMD) $^ -o $@ - -poly_test2: poly.o fail.o comm.o tensor.o gs_local.o poly_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -poly_test: poly.o fail.o comm.o tensor.o gs_local.o poly_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -lob_bnd_test: tensor.o poly.o lob_bnd.o fail.o comm.o gs_local.o lob_bnd_test.o ; @echo LINK 
$@; $(LINKCMD) $^ -o $@ -obbox_test: rand_elt_test.o poly.o obbox.o tensor.o lob_bnd.o fail.o comm.o gs_local.o obbox_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -findpts_el_2_test2: tensor.o rand_elt_test.o lob_bnd.o fail.o comm.o gs_local.o poly.o findpts_el_2.o findpts_el_2_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -findpts_el_2_test: poly.o fail.o comm.o tensor.o gs_local.o findpts_el_2.o findpts_el_2_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -findpts_el_3_test2: tensor.o rand_elt_test.o lob_bnd.o fail.o comm.o gs_local.o poly.o findpts_el_3.o findpts_el_3_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -findpts_el_3_test: poly.o fail.o comm.o tensor.o gs_local.o findpts_el_3.o findpts_el_3_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -findpts_local_test: rand_elt_test.o lob_bnd.o fail.o comm.o tensor.o gs_local.o poly.o findpts_local.o sort.o sarray_sort.o obbox.o findpts_el_3.o findpts_el_2.o findpts_local_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ -findpts_test: sarray_transfer.o sort.o rand_elt_test.o lob_bnd.o poly.o findpts.o sarray_sort.o findpts_local.o obbox.o tensor.o findpts_el_3.o findpts_el_2.o fail.o crystal.o comm.o gs_local.o findpts_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@ - diff --git a/3rdParty/gslib.github/src/amg.c b/3rdParty/gslib.github/src/amg.c deleted file mode 100644 index 90651f1e1..000000000 --- a/3rdParty/gslib.github/src/amg.c +++ /dev/null @@ -1,1221 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include "fail.h" -#include "mem.h" -#include "sort.h" -#include "sarray_sort.h" -#include "gs_defs.h" -#include "comm.h" -#include "crystal.h" -#include "sarray_transfer.h" -#include "gs.h" - -#define crs_setup PREFIXED_NAME(crs_setup) -#define crs_solve PREFIXED_NAME(crs_solve) -#define crs_stats PREFIXED_NAME(crs_stats) -#define crs_free PREFIXED_NAME(crs_free ) - -#ifndef AMG_BLOCK_ROWS -# define AMG_BLOCK_ROWS 1200 -#endif - -static double get_time(void) -{ 
-#ifdef GS_TIMING - return comm_time(); -#else - return 0; -#endif -} - -static void barrier(const struct comm *c) -{ -#ifdef GS_BARRIER - comm_barrier(c); -#endif -} - -/* sparse matrix, condensed sparse row */ -struct csr_mat { - uint rn, cn, *row_off, *col; - double *a; -}; - -/* z = alpha y + beta M x */ -static double apply_M( - double *z, const double alpha, const double *y, - const double beta, const struct csr_mat *const M, const double *x) -{ - uint i; const uint rn=M->rn; - const uint *const row_off = M->row_off, *col = M->col; - const double *a = M->a; - const double t0 = get_time(); - for(i=0;irn,cn=M->cn; - const uint *const row_off = M->row_off, *col = M->col; - const double *a = M->a; - const double t0 = get_time(); - for(i=0;inloc*sizeof(double)); - barrier(comm); t0=get_time(); - gs(ve,gs_double,gs_add,0,Q->gsh,0); - return get_time()-t0; -} - -/* z := alpha y + beta Q^t x - (x := Q Q^t x as a side effect) - */ -static double apply_Qt( - double *z, const double alpha, const double *y, - const double beta, const struct Q *Q, double *x, - const struct comm *comm) -{ - double t0,t1; - uint i; const uint nloc=Q->nloc; - barrier(comm); t0=get_time(); - gs(x,gs_double,gs_add,1,Q->gsh,0); - t1 = get_time() - t0; - for(i=0;ilevels; - const uint *const off = amg->lvl_offset; - double *const b = amg->b, *const x = amg->x; - double *c = amg->c, *c_old = amg->c_old, *r = amg->r; - double *timing = amg->timing; - /* restrict down all levels */ - for(lvl=0;lvlbuf, &amg->W[lvl],b_l); - timing[0]+=apply_Qt(b_lp1, 1,b_lp1, 1,&amg->Q_W[lvl],amg->buf, &amg->comm); - } - /* solve bottom equation (1 dof) */ - { const uint i=off[levels-1]; - if(off[levels]-i) x[i]=amg->Dff[i]*b[i]; } - for(lvl=levels-1;lvl--;) { - double *const b_l = b+off[lvl]; - double *const x_l = x+off[lvl], *const x_lp1 = x+off[lvl+1]; - const double *const d_l = amg->Dff+off[lvl]; - const uint n = off[lvl+1]-off[lvl]; uint i; - const unsigned m = amg->cheb_m[lvl]; unsigned ci; - double alpha, 
beta, gamma; - timing = amg->timing + lvl*6; - /* buf = Q x_{l+1} */ - timing[2]+=apply_Q(amg->buf, &amg->Q_AfP[lvl],x_lp1, &amg->comm); - /* x_l = W x_{l+1} */ - timing[3]+=apply_M(x_l, 0,b_l, 1,&amg->W [lvl], amg->buf); - /* b_l -= AfP x_{l+1} */ - timing[3]+=apply_M(b_l, 1,b_l, -1,&amg->AfP[lvl], amg->buf); - /* c_1 = Dff b_l */ - for(i=0;i1) { - alpha = amg->cheb_rho[lvl]/2, alpha*=alpha; - gamma = 2*alpha/(1-2*alpha), beta = 1 + gamma; - /* r_1 = b_l - Aff c_1 */ - timing[4]+=apply_Q(amg->buf, &amg->Q_Aff[lvl],c, &amg->comm); - timing[5]+=apply_M(r, 1,b_l, -1,&amg->Aff[lvl],amg->buf); - /* c_2 = (1+gamma)(c_1 + Dff r_1) */ - { double *const temp = c; c=c_old,c_old=temp; } - for(i=0;ibuf, &amg->Q_Aff[lvl],c, &amg->comm); - timing[5]+=apply_M(r, 1,b_l, -1,&amg->Aff[lvl],amg->buf); - /* c_{i+1} = (1+gamma)*(c_i+D r_i) - gamma c_{i-1} */ - { double *const temp = c; c=c_old,c_old=temp; } - for(i=0;itiming_n++; -} - -void crs_solve(double *x, struct crs_data *data, double *b) -{ - uint i; const uint un = data->un; const uint *const umap = data->umap; - double *const ub = data->b, *const ux = data->x; - - gs(b, gs_double,gs_add, 1, data->gs_top, 0); - for(i=0;inull_space) { - const double avg = data->tni*comm_reduce_double(&data->comm, gs_add, ux,un); - for(i=0;igs_top, 0); -} - -void crs_stats(const struct crs_data *const data) -{ - const unsigned lm1 = data->levels-1; - double *avg = tmalloc(double, 2*6*lm1); - double ni = 1/((double)data->timing_n * data->comm.np); - uint i; - for(i=0;i<6*lm1;++i) avg[i] = ni*data->timing[i]; - comm_allreduce(&data->comm,gs_double,gs_add, avg,6*lm1, avg+6*lm1); - if(data->comm.id==0) { - double *t = avg; unsigned lvl; - printf("AMG stats:\n"); - for(lvl=0;lvln=count; - if(rid_map==0) { - map=tmalloc(uint,count); - for(count=0,i=0;iptr; - for(count=0,i=0;ii; - while(uid[i]i=id_perm[i]; - } -} - -static void find_mat_offs(uint *const off, const int levels, - const struct gnz *const p, const uint nz, - const struct id_data *const id) 
-{ - int lvl=-1; uint i; for(i=0;iptr; ulong last_id = -(ulong)1; - const uint ne = ids->n; - uint k,i=0,ie=nloc; - sarray_sort(struct gnz,p,nz, j,1, buf); /* sort by col */ - for(k=0;kn+1); - id[ids->n] = -(slong)j_id, p[k].j = ids->n++; - } - sarray_sort_2(struct gnz,p,nz, i,1, j,1, buf); - - mat->cn = ids->n; - compress_mat(mat->row_off,mat->rn, mat->col,mat->a, p,nz); -} - -static uint amg_setup_mats( - struct crs_data *const data, - const ulong *const uid, const uint uid_n, - const uint *const id_perm, - const struct id_data *const id, - struct array *mat, - buffer *buf) -{ - const uint *const off = data->lvl_offset; - struct array ide = null_array; uint max_e=0; - const unsigned levels=data->levels; - unsigned lvl, m; - uint *mat_off[3]; struct csr_mat *csr_mat[3]; - data->Q_W = tmalloc(struct Q, (levels-1)*3); - data->Q_AfP = data->Q_W + (levels-1); - data->Q_Aff = data->Q_AfP + (levels-1); - csr_mat[0] = data->W = tmalloc(struct csr_mat, (levels-1)*3); - csr_mat[1] = data->AfP = data->W + (levels-1); - csr_mat[2] = data->Aff = data->AfP + (levels-1); - mat_off[0] = tmalloc(uint, levels*3); - mat_off[1] = mat_off[0]+levels; - mat_off[2] = mat_off[1]+levels; - for(m=0;m<3;++m) { - /* change row from uid to local index */ - localize_rows(mat[m].ptr,mat[m].n, uid,id_perm); - /* sort by row */ - sarray_sort(struct gnz,mat[m].ptr,mat[m].n, i,1, buf); - /* find offsets of each level */ - find_mat_offs(mat_off[m],levels, mat[m].ptr,mat[m].n, id); - } - /* allocate CSR arrays */ - if(levels>1) { - uint *ui = tmalloc(uint, 3*((off[levels-1]-off[0])+(levels-1)) - +mat[0].n+mat[1].n+mat[2].n); - double *a = tmalloc(double, mat[0].n+mat[1].n+mat[2].n); - for(m=0;m<3;++m) for(lvl=0;lvlQ_W[lvl].nloc = data->Q_AfP[lvl].nloc = ide.n = nloc; - amg_setup_mat(&ide,nloc, &data->W[lvl], - uid, uid_n, id_perm, off[lvl],j0, - (struct gnz*)mat[0].ptr + mat_off[0][lvl], - mat_off[0][lvl+1]-mat_off[0][lvl], buf); - data->Q_W[lvl].gsh = gs_setup(ide.ptr,ide.n, &data->comm, 
0,gs_auto,1); - amg_setup_mat(&ide,nloc, &data->AfP[lvl], - uid, uid_n, id_perm, off[lvl],j0, - (struct gnz*)mat[1].ptr + mat_off[1][lvl], - mat_off[1][lvl+1]-mat_off[1][lvl], buf); - data->Q_AfP[lvl].gsh = gs_setup(ide.ptr,ide.n, &data->comm, 0,gs_auto,1); - if(ide.n>max_e) max_e=ide.n; - } - for(lvl=0;lvlQ_Aff[lvl].nloc = ide.n = nloc; - amg_setup_mat(&ide,nloc, &data->Aff[lvl], - uid, uid_n, id_perm, j0,j0, - (struct gnz*)mat[2].ptr + mat_off[2][lvl], - mat_off[2][lvl+1]-mat_off[2][lvl], buf); - data->Q_Aff[lvl].gsh = gs_setup(ide.ptr,ide.n, &data->comm, 0,gs_auto,1); - if(ide.n>max_e) max_e=ide.n; - } - free(mat_off[0]); - array_free(&ide); - return max_e; -} - -static uint *compute_offsets( - const struct id_data *const id, const uint n, - const int levels) -{ - uint i, *off = tmalloc(uint, levels+1); - int lvl = -1; - for(i=0;icomm); - data->umap = assign_dofs(&uid,0, id,n,data->comm.id,data->gs_top,&cr.data); - data->un = uid.n; - - sortp_long(&cr.data,0, uid.ptr,uid.n,sizeof(ulong)); - sarray_permute(ulong,uid.ptr ,uid.n, cr.data.ptr, &temp_long); - sarray_permute(uint ,data->umap,uid.n, cr.data.ptr, &max_e); - - read_data(data, &ids, mat, &cr, uid.ptr,uid.n); - - /* we should have data for every uid; - if not, then the data is for a smaller problem than we were given */ - { int not_happy = ids.n==uid.n ? 
0 : 1; - if(comm_reduce_int(&data->comm, gs_max, ¬_happy,1)) { - comm_barrier(&data->comm); - if(data->comm.id==0) - fail(1,__FILE__,__LINE__,"AMG: missing data for some rows"); - else die(1); - } - } - - sarray_sort(struct id_data,ids.ptr,ids.n, id,1, &cr.data); - sarray_sort(struct id_data,ids.ptr,ids.n, level,0, &cr.data); - sarray_permute(uint,data->umap,ids.n, cr.data.ptr, &max_e); - id_perm = tmalloc(uint, uid.n); - sarray_perm_invert(id_perm, cr.data.ptr, uid.n); - /* the global id uid[i] will have local index id_perm[i] - (the local storage is sorted by level) */ - - data->lvl_offset = compute_offsets(ids.ptr,ids.n, data->levels); - - max_e = amg_setup_mats(data,uid.ptr,uid.n,id_perm,ids.ptr,mat, &cr.data); - - { const unsigned levels=data->levels; - const uint *const off = data->lvl_offset; - struct id_data *const id = ids.ptr; const uint n = ids.n; - double *d; - uint i,max_f=0; - for(i=0;imax_f) max_f=nf; - } - d = data->Dff = tmalloc(double, 3*n + 3*max_f + max_e + 6*(levels-1)); - data->x = data->Dff + n; - data->b = data->x + n; - data->c = data->b + n; - data->c_old = data->c + max_f; - data->r = data->c_old + max_f; - data->buf = data->r + max_f; - data->timing = data->buf + max_e; - for(i=0;itiming[i]=0; - data->timing_n=0; - } - - free(id_perm); - array_free(&ids); - array_free(&mat[0]); - array_free(&mat[1]); - array_free(&mat[2]); - array_free(&uid); - crystal_free(&cr); -} - -static void amg_dump( - uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const double *A, - struct crs_data *data); - -struct crs_data *crs_setup( - uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const double *A, - uint null_space, const struct comm *comm) -{ - struct crs_data *data = tmalloc(struct crs_data,1); - -#ifdef AMG_DUMP - int dump=1; -#else - int dump=0; -#endif - - comm_dup(&data->comm,comm); - - data->gs_top = gs_setup((const slong*)id,n, &data->comm, 1, - dump?gs_crystal_router:gs_auto, !dump); - - if(dump) { - 
amg_dump(n,id,nz,Ai,Aj,A,data); - gs_free(data->gs_top); - - if(data->comm.id==0) printf("AMG dump successful\n"), fflush(stdout); - comm_barrier(&data->comm); - comm_free(&data->comm); - free(data); - die(0); - } else { - data->null_space = null_space; - amg_setup_aux(data, n,id); - } - return data; -} - -void crs_free(struct crs_data *data) -{ - const unsigned levels = data->levels; - - free(data->Dff); - - if(levels>1) { - unsigned lvl; - for(lvl=0;lvlQ_Aff[lvl].gsh), - gs_free(data->Q_AfP[lvl].gsh), - gs_free(data->Q_W[lvl].gsh); - free(data->W[0].a); - free(data->W[0].row_off); - } - - free(data->W); - free(data->Q_W); - - free(data->lvl_offset); - - free(data->cheb_m); - free(data->cheb_rho); - - free(data->umap); - gs_free(data->gs_top); - comm_free(&data->comm); - - free(data); -} - -/*========================================================================== - - Find ID - - As proc 0 reads in the files, it needs to know where to send the data. - The find_id function below takes a sorted list of id's (with no repeats) - and outputs the corresponding list of owning procs. 
- - ==========================================================================*/ - -struct find_id_map { ulong id; uint p; }; - -struct find_id_data { - struct array map; - struct array work; - struct crystal *cr; -}; - -static void find_id_setup( - struct find_id_data *const data, - const ulong *id, uint n, - struct crystal *const cr) -{ - const uint np = cr->comm.np; - uint i; struct find_id_map *q; - - data->cr = cr; - data->work.ptr=0, data->work.n=0, data->work.max=0; - array_init(struct find_id_map, &data->map, n); - data->map.n=n; - for(q=data->map.ptr,i=0;imap,p,1,cr); - sarray_sort(struct find_id_map,data->map.ptr,data->map.n, id,1, &cr->data); -} - -static void find_id_free(struct find_id_data *const data) -{ - array_free(&data->map); - array_free(&data->work); -} - -struct find_id_work { ulong id; uint p; uint wp; }; - -static int find_id( - uint *p_out, const unsigned p_stride, - struct find_id_data *const data, - const ulong *id, const unsigned id_stride, const uint n) -{ - struct find_id_work *p, *p_end; - const struct find_id_map *q=data->map.ptr, *const q_end = q+data->map.n; - const uint np = data->cr->comm.np; - int not_found = 0; - - uint nn; - p = array_reserve(struct find_id_work, &data->work, n); - for(nn=n;nn;--nn,id=(const ulong*)((const char*)id+id_stride)) - p->id=*id, p->p=0, p->wp = (*id)%np, ++p; - data->work.n=n; - - /* send to work proc */ - sarray_transfer(struct find_id_work,&data->work,wp,1,data->cr); - - /* match id's with rid's */ - sarray_sort(struct find_id_work,data->work.ptr,data->work.n, id,1, - &data->cr->data); - for(p=data->work.ptr,p_end=p+data->work.n;p!=p_end;++p) { - while(q!=q_end && q->idid) ++q; - if(q==q_end) break; - if(q->id!=p->id) not_found=1,p->p=-(uint)1; - else p->p=q->p; - } - for(;p!=p_end;++p) not_found=1,p->p=-(uint)1; - - /* send back */ - sarray_transfer(struct find_id_work,&data->work,wp,0,data->cr); - - /* map back to user data */ - sarray_sort(struct find_id_work,data->work.ptr,data->work.n, id,1, 
- &data->cr->data); - p = data->work.ptr; - for(nn=n;nn;--nn,p_out=(uint*)((char*)p_out+p_stride)) - *p_out = p->p, ++p; - - return comm_reduce_int(&data->cr->comm, gs_max, ¬_found,1); -} - - -/*========================================================================== - - File I/O - - small wrappers to read/write files consisting of doubles - first double is 3.14159 --- a marker to test for endianness - - ==========================================================================*/ - -/* reverse the byte order of the given double - portable up to 129-byte doubles, standards conforming */ -#define N sizeof(double) -static double byteswap(double x) -{ - char t, buf[N]; - memcpy(buf,&x,N); -#define SWAP1(i) if(N>2*(i)+1) t=buf[i],buf[i]=buf[N-1-(i)],buf[N-1-(i)]=t -#define SWAP2(i) SWAP1(i); SWAP1((i)+0x01); SWAP1((i)+0x02); SWAP1((i)+0x03) -#define SWAP3(i) SWAP2(i); SWAP2((i)+0x04); SWAP2((i)+0x08); SWAP2((i)+0x0c) -#define SWAP4(i) SWAP3(i); SWAP3((i)+0x10); SWAP3((i)+0x20); SWAP3((i)+0x30) - SWAP4(0); -#undef SWAP1 -#undef SWAP2 -#undef SWAP3 -#undef SWAP4 - memcpy(&x,buf,N); - return x; -} -#undef N - -struct file { - FILE *fptr; - int swap; -}; - -static int rfread(void *ptr, size_t n, FILE *const fptr) -{ - size_t na; char *p=ptr; - while(n && (na=fread (p,1,n,fptr))) n-=na, p+=na; - return n!=0; -} - -static int rfwrite(FILE *const fptr, const void *ptr, size_t n) -{ - size_t na; const char *p=ptr; - while(n && (na=fwrite(p,1,n,fptr))) n-=na, p+=na; - return n!=0; -} - - -static struct file dopen(const char *filename, const char mode,int *code) -{ - const double magic = 3.14159; - double t; - struct file f; - f.fptr = fopen(filename,mode=='r'?"r":"w"); - f.swap = 0; - if(f.fptr==0) { - diagnostic("ERROR ",__FILE__,__LINE__, - "AMG: could not open %s for %s", - filename,mode=='r'?"reading":"writing"); - *code=1; - return f; - } - if(mode=='r') { - if(rfread(&t,sizeof(double), f.fptr)){ - diagnostic("ERROR ",__FILE__,__LINE__, - "AMG: could not read from 
%s",filename); - *code=1; - return f; - } - if(fabs(t-magic)>0.000001) { - t=byteswap(t); - if(fabs(t-magic)>0.000001) { - diagnostic("ERROR ",__FILE__,__LINE__, - "AMG: magic number for endian test not found in %s", - filename); - *code=1; - return f; - } - f.swap = 1; - } - } else { - if(rfwrite(f.fptr, &magic,sizeof(double))){ - fail("ERROR ",__FILE__,__LINE__, - "AMG: could not write to %s",filename); - *code=1; - return f; - } - } - return f; -} - -static void dread(double *p, size_t n, const struct file f,int *code) -{ - if(rfread(p,n*sizeof(double),f.fptr)) { - diagnostic("ERROR ",__FILE__,__LINE__, - "AMG: failed reading %u doubles from disk",(unsigned)n); - *code=1; - return; - } - if(f.swap) while(n) *p=byteswap(*p), ++p, --n; -} - -static void dwrite(const struct file f, const double *p, size_t n,int *code) -{ - if(rfwrite(f.fptr, p,n*sizeof(double))) { - diagnostic("ERROR ",__FILE__,__LINE__, - "AMG: failed writing %u doubles to disk",(unsigned)n); - *code=1; - } -} - -static void dclose(const struct file f) -{ - fclose(f.fptr); -} - -/*========================================================================== - - Read amg.dat, amg_*.dat - - The function read_data is responsible for reading these files, - and creating arrays - ids of struct id_data - mat[3] of struct gnz (W, AfP, Aff) - distributed to the appropriate procs - - ==========================================================================*/ - -static ulong read_level_data(struct crs_data *data, struct file f) -{ - unsigned i,n; ulong tn; double *buf; - int updt=1; - int code=0; - if(data->comm.id==0) { -//Check header for newest amg_matlab code. 
- double hdr; dread(&hdr,1,f,&code); - if(code==0) { - printf("AMG version %3.2f\n",hdr); - if(hdr!=2.01) { - printf("Update amg_matlab tool and create new .dat files before re-running\n"); - updt=0; - } - double t; dread(&t,1,f,&code); - data->levels = t; - printf("AMG: %u levels\n", data->levels); - } - } - comm_bcast(&data->comm, &data->levels,sizeof(unsigned), 0); - comm_bcast(&data->comm, &updt,sizeof(int), 0); - comm_bcast(&data->comm, &code,sizeof(int), 0); - if(!updt||code!=0) die(1); - - n = data->levels-1; - data->cheb_m = tmalloc(unsigned,n); - data->cheb_rho = tmalloc(double ,n); - - buf = tmalloc(double, 2*n+1); - if(data->comm.id==0) dread(buf,2*n+1,f,&code); - comm_bcast(&data->comm, &code,sizeof(int), 0); - if(code!=0) die(1); - - comm_bcast(&data->comm, buf,(2*n+1)*sizeof(double), 0); - for(i=0;icheb_m[i] = buf[i]; - for(i=0;icheb_rho[i] = buf[n+i]; - tn = buf[2*n]; - data->tni = 1/(double)tn; - free(buf); - - if(data->comm.id==0) { - printf("AMG Chebyshev smoother data:\n"); - for(i=0;icheb_m[i], (double)data->cheb_rho[i]); - printf("AMG: %lu rows\n", (unsigned long)tn); - } - - return tn; -} - -static void read_data( - struct crs_data *const data, - struct array *ids, struct array mat[3], - struct crystal *const cr, - const ulong *uid, const uint uid_n) -{ - int code; - struct find_id_data fid; - const uint pid = data->comm.id; - ulong r,tn; - struct array read_buffer=null_array; - struct array id_buffer = null_array, mat_buffer = null_array; - uint *row_lens=0, *id_proc=0, *id_perm=0; - struct array mat_proc = null_array; - struct file f={0,0}, fm[3]={{0,0},{0,0},{0,0}}; - unsigned m; - ids->ptr=0, ids->n=0, ids->max=0; - for(m=0;m<3;++m) mat[m].ptr=0,mat[m].n=0,mat[m].max=0; - find_id_setup(&fid, uid,uid_n, cr); - code=0; - if(pid==0) { - f = dopen("amg.dat",'r',&code); - fm[0] = dopen("amg_W.dat",'r',&code); - fm[1] = dopen("amg_AfP.dat",'r',&code); - fm[2] = dopen("amg_Aff.dat",'r',&code); - if(code==0) { - array_init(double, &read_buffer, 
6*AMG_BLOCK_ROWS); - row_lens = tmalloc(uint, 5*AMG_BLOCK_ROWS); - id_proc = row_lens + 3*AMG_BLOCK_ROWS; - id_perm = id_proc + AMG_BLOCK_ROWS; - array_reserve(struct id_data, &id_buffer, AMG_BLOCK_ROWS); - } - } - comm_bcast(&data->comm,&code,sizeof(int),0); - if(code!=0) die(1); - - tn = read_level_data(data,f); - for(r=0;rAMG_BLOCK_ROWS ? AMG_BLOCK_ROWS : (unsigned)(tn-r); - uint mat_size[3]={0,0,0}; - - /* read id data */ - if(pid==0) { - unsigned i; - struct id_data *idp = id_buffer.ptr; - double *b = read_buffer.ptr; - dread(b,nr*6, f,&code); - if(code==0) { - printf("AMG: reading through row %lu, pass %u/%u\n", - (unsigned long)b[(nr-1)*6], - (unsigned)(r/AMG_BLOCK_ROWS+1), - (unsigned)((tn+AMG_BLOCK_ROWS-1)/AMG_BLOCK_ROWS)); - for(i=0;idata); - sarray_perm_invert(id_perm, cr->data.ptr, nr); - } - } else - id_buffer.n=0; - - comm_bcast(&data->comm,&code,sizeof(int),0); - if(code!=0)die(1); - - /* find who owns each row */ - if(find_id(id_proc,sizeof(uint), &fid, - (const ulong*)((const char*)id_buffer.ptr+offsetof(struct id_data,id)), - sizeof(struct id_data), id_buffer.n)) { - if(pid==0) - fail(1,__FILE__,__LINE__,"AMG: data has more rows than given problem"); - else die(1); - } - if(pid==0) { /* undo sorting of id_buffer */ - buffer_reserve(&cr->data,sizeof(struct id_data)+sizeof(uint)); - sarray_permute(struct id_data,id_buffer.ptr,nr, id_perm, cr->data.ptr); - sarray_permute(uint, id_proc, nr, id_perm, cr->data.ptr); - } - /* read matrix data */ - for(m=0;m<3;++m) { - if(pid==0) { - const struct id_data *const idp = id_buffer.ptr; - unsigned i; uint j,rl; - struct gnz *p = array_reserve(struct gnz, &mat_buffer, mat_size[m]); - double *b = array_reserve(double, &read_buffer, 2*mat_size[m]); - uint *proc = array_reserve(uint, &mat_proc, mat_size[m]); - dread(b,2*mat_size[m], fm[m],&code); - if(code==0) { - for(i=0;ii = i_id, p->j = *b++, p->a = *b++, *proc++ = i_p, ++p; - } - mat_buffer.n = mat_size[m]; - } - } else - mat_buffer.n = 0; - 
comm_bcast(&data->comm,&code,sizeof(int),0); - if(code!=0)die(1); - sarray_transfer_ext(struct gnz,&mat_buffer,mat_proc.ptr,sizeof(uint),cr); - array_cat(struct gnz,&mat[m],mat_buffer.ptr,mat_buffer.n); - } - /* send id_data to owner */ - sarray_transfer_ext(struct id_data,&id_buffer,id_proc,sizeof(uint),cr); - array_cat(struct id_data,ids,id_buffer.ptr,id_buffer.n); - } - array_free(&id_buffer); - array_free(&mat_buffer); - if(pid==0) { - array_free(&read_buffer); - free(row_lens); - array_free(&mat_proc); - dclose(fm[2]); - dclose(fm[1]); - dclose(fm[0]); - dclose(f); - } - find_id_free(&fid); -} - -/*========================================================================== - - Write amgdmp_*.dat - - The user's matrix is first assembled, then written out. - - ==========================================================================*/ - - -enum mat_order { row_major, col_major }; -enum distr { row_distr, col_distr }; - -#define rid_equal(a,b) ((a).p==(b).p && (a).i==(b).i) - -/* rnz is a mnemonic for remote non zero */ -struct rnz { - double v; struct rid i,j; -}; -#define nz_pos_equal(a,b) \ - (rid_equal((a).i,(b).i) && rid_equal((a).j,(b).j)) - -static void mat_sort(struct array *const mat, - const enum mat_order order, buffer *const buf) -{ - switch(order) { - case col_major: sarray_sort_4(struct rnz,mat->ptr,mat->n, - j.p,0,j.i,0, i.p,0,i.i,0, buf); break; - case row_major: sarray_sort_4(struct rnz,mat->ptr,mat->n, - i.p,0,i.i,0, j.p,0,j.i,0, buf); break; - } -} - -/* assumes matrix is already sorted */ -static void mat_condense_sorted(struct array *const mat) -{ - struct rnz *dst,*src, *const end=(struct rnz*)mat->ptr+mat->n; - if(mat->n<=1) return; - for(dst=mat->ptr;;++dst) { - if(dst+1==end) return; - if(nz_pos_equal(dst[0],dst[1])) break; - } - for(src=dst+1;src!=end;++src) { - if(nz_pos_equal(*dst,*src)) - dst->v += src->v; - else - *(++dst) = *src; - } - mat->n = (dst+1)-(struct rnz*)mat->ptr; -} - -static void mat_condense( - struct array *const mat, 
const enum mat_order order, buffer *const buf) -{ - mat_sort(mat,order,buf); mat_condense_sorted(mat); -} - -static void mat_distribute( - struct array *const mat, const enum distr d, const enum mat_order o, - struct crystal *const cr) -{ - switch(d) { - case row_distr: mat_condense(mat,row_major,&cr->data); - sarray_transfer(struct rnz,mat, i.p,0, cr); break; - case col_distr: mat_condense(mat,col_major,&cr->data); - sarray_transfer(struct rnz,mat, j.p,0, cr); break; - } - mat_condense(mat,o,&cr->data); -} - -struct labelled_rid { - struct rid rid; ulong id; -}; - -static void mat_list_nonlocal_sorted( - struct array *const nonlocal_id, - const struct array *const mat, const enum distr d, - const ulong *uid, struct crystal *const cr) -{ - const uint pid = cr->comm.id; - struct rid last_rid; - const struct rnz *p, *const e=(const struct rnz*)mat->ptr+mat->n; - uint count; struct labelled_rid *out, *end; - #define BUILD_LIST(k) do { \ - last_rid.p=-(uint)1,last_rid.i=-(uint)1; \ - for(count=0,p=mat->ptr;p!=e;++p) { \ - if(p->k.p==pid || rid_equal(last_rid,p->k)) continue; \ - last_rid=p->k; ++count; \ - } \ - array_init(struct labelled_rid, nonlocal_id, count); \ - nonlocal_id->n=count; out=nonlocal_id->ptr; \ - last_rid.p=-(uint)1,last_rid.i=-(uint)1; \ - for(p=mat->ptr;p!=e;++p) { \ - if(p->k.p==pid || rid_equal(last_rid,p->k)) continue; \ - (out++)->rid=last_rid=p->k; \ - } \ - } while(0) - switch(d) { - case row_distr: BUILD_LIST(j); break; - case col_distr: BUILD_LIST(i); break; - } - #undef BUILD_LIST - sarray_transfer(struct labelled_rid,nonlocal_id,rid.p,1,cr); - for(out=nonlocal_id->ptr,end=out+nonlocal_id->n;out!=end;++out) - out->id=uid[out->rid.i]; - sarray_transfer(struct labelled_rid,nonlocal_id,rid.p,1,cr); - sarray_sort_2(struct labelled_rid,nonlocal_id->ptr,nonlocal_id->n, - rid.p,0, rid.i,0, &cr->data); -} - -static void mat_list_nonlocal( - struct array *const nonlocal_id, - struct array *const mat, const enum distr d, - const ulong *uid, struct 
crystal *const cr) -{ - switch(d) { - case row_distr: mat_sort(mat,col_major,&cr->data); break; - case col_distr: mat_sort(mat,row_major,&cr->data); break; - } - mat_list_nonlocal_sorted(nonlocal_id,mat,d,uid,cr); -} - -static uint dump_matrix_setdata( - buffer *const buf, /* output; ok if this is one of cr's buffers */ - struct array *const mat, const ulong *const uid, - struct crystal *const cr) -{ - const uint pid = cr->comm.id; - struct array nonlocal_id; - double *vi, *vj, *va; uint n; - const struct rnz *nz, *enz; - const struct labelled_rid *rlbl; - - mat_distribute(mat,row_distr,col_major,cr); - n = mat->n; - - mat_list_nonlocal_sorted(&nonlocal_id,mat,row_distr,uid,cr); - - buffer_reserve(buf,3*n*sizeof(double)); - vi=buf->ptr, vj=vi+n, va=vj+n; - rlbl = nonlocal_id.ptr; - for(nz=mat->ptr,enz=nz+n;nz!=enz;++nz) { - *vi++ = uid[nz->i.i]; - *va++ = nz->v; - if(nz->j.p==pid) - *vj++ = uid[nz->j.i]; - else { - const uint jp = nz->j.p, ji = nz->j.i; - while(rlbl->rid.prid.p!=jp) printf("dump_matrix: FAIL!!!\n"); - while(rlbl->rid.irid.i!=ji) printf("dump_matrix: FAIL!!!\n"); - *vj++ = rlbl->id; - } - } - array_free(&nonlocal_id); - return n; -} - -static void dump_matrix( - struct array *const mat, const ulong *const uid, - struct crystal *const cr) -{ - const struct comm *comm = &cr->comm; - const uint pid = comm->id, np = comm->np; - buffer *const buf = &cr->data; - struct file fi={0,0}, fj={0,0}, fp={0,0}; - uint i,n; - int code; - - n = dump_matrix_setdata(buf, mat,uid,cr); - - code = 0; - if(pid==0) { - fi=dopen("amgdmp_i.dat",'w',&code); - fj=dopen("amgdmp_j.dat",'w',&code); - fp=dopen("amgdmp_p.dat",'w',&code); - } - comm_bcast(comm,&code,sizeof(int),0); - if(code!=0) die(1); - - for(i=0;iptr,3*n*sizeof(double), 0,np+i); - else if(pid==0) { - double *v; - printf("AMG writing data from proc %u\n",(unsigned)i),fflush(stdout); - if(i!=0) { - comm_recv(comm, &n,sizeof(uint), i,i); - buffer_reserve(buf,3*n*sizeof(double)); - comm_recv(comm, 
buf->ptr,3*n*sizeof(double), i,np+i); - } - v = buf->ptr; - if(code==0) { - dwrite(fi, v , n,&code); - dwrite(fj, v+ n, n,&code); - dwrite(fp, v+2*n, n,&code); - } - } - } - if(pid==0) dclose(fi),dclose(fj),dclose(fp); - comm_bcast(comm,&code,sizeof(int),0); - if(code!=0) die(1); - comm_barrier(comm); -} - -/* assumes data->comm and data->gs_top are set */ -static void amg_dump( - uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const double *A, - struct crs_data *data) -{ - struct crystal cr; - struct array uid; struct rid *rid_map = tmalloc(struct rid,n); - - struct array mat; - uint k; struct rnz *out; - - crystal_init(&cr, &data->comm); - assign_dofs(&uid,rid_map, id,n,data->comm.id,data->gs_top,&cr.data); - - array_init(struct rnz,&mat,nz); - for(out=mat.ptr,k=0;kv = a, out->i=rid_map[i], out->j=rid_map[j]; - ++out; - } - mat.n = out-(struct rnz*)mat.ptr; - free(rid_map); - - dump_matrix(&mat,uid.ptr,&cr); - - array_free(&uid); - crystal_free(&cr); -} diff --git a/3rdParty/gslib.github/src/amg_dump.c b/3rdParty/gslib.github/src/amg_dump.c deleted file mode 100644 index e353be59e..000000000 --- a/3rdParty/gslib.github/src/amg_dump.c +++ /dev/null @@ -1,153 +0,0 @@ -static void condense_matrix(tuple_list *mat, buffer *buf) -{ - unsigned mi=mat->mi,ml=mat->ml,mr=mat->mr; - slong *vl=mat->vl; - uint si,di,n=mat->n; - tuple_list_sort(mat,mi+1,buf); - tuple_list_sort(mat,mi+0,buf); - for(di=0,si=0; sivr[mr*di]+=mat->vr[mr*si]; - } else { - ++di; - if(di!=si) { - memcpy(mat->vi+mi*di,mat->vi+mi*si,mi*sizeof(sint)); - memcpy(mat->vl+ml*di,mat->vl+ml*si,ml*sizeof(slong)); - memcpy(mat->vr+mr*di,mat->vr+mr*si,mr*sizeof(real)); - } - } - } - if(n) mat->n=di+1; -} - -#ifndef MPI -# define MPI_Comm int -#endif -static void write_matrix(tuple_list *mat, buffer *buf, MPI_Comm comm, - uint pid, uint np) -{ - FILE *fi,*fj,*fp; - uint i,n=mat->n; - double *v; - buffer_reserve(buf,3*n*sizeof(double)); - v = buf->ptr; - for(i=0;ivl[2*i+0]; - for(i=0;ivl[2*i+1]; 
- for(i=0;ivr[i]; - v = buf->ptr; - if(pid==0) { - const double magic = 3.14159; - fi=fopen("amgdmp_i.dat","w"); - fj=fopen("amgdmp_j.dat","w"); - fp=fopen("amgdmp_p.dat","w"); - fwrite(&magic,sizeof(double),1,fi); - fwrite(&magic,sizeof(double),1,fj); - fwrite(&magic,sizeof(double),1,fp); - } - for(i=0;iall->buf; -#else - int comm=0; - buffer buf_static, *buf=&buf_static; -#endif -#ifdef MPI - MPI_Comm_dup(crystal->comm,&comm); - pid = crystal->id, np = crystal->num; -#else - pid = 0, np = 1; - buffer_init(buf,1024); -#endif - - perm = tmalloc(sint,n); - setup_dofs(&cdof,perm, n,id, pid, crystal,buf); - tuple_list_init_max(&mat,1,2,1,nz), mat.n=0; - for(i=0;inloc>Q->n) return 1; - if(Q->gs) return 0; - return 1; -} -#endif - -/* ve = Q v */ -static double apply_Q(real *ve, const amg_Q *Q, const real *v, mpicomm_t comm) -{ - double t0=0,t1=0; - uint i, nloc=Q->nloc,n=Q->n; - for(i=0;igs);*/ -#if !defined(USE_FLOAT) - jl_gs(ve,gs_double,gs_add,0,Q->jgs,0); -#else - jl_gs(ve,gs_float,gs_add,0,Q->jgs,0); -#endif - -# ifdef GS_TIMING - t1 = MPI_Wtime() - t0; -# endif -# endif - return t1; -} - -/* z := alpha y + beta Q^t x - (x := Q Q^t x as a side effect) - */ -static double apply_Qt(real *z, real alpha, const real *y, - real beta, const amg_Q *Q, real *x, mpicomm_t comm) -{ - double t0=0,t1=0; - uint i, nloc=Q->nloc; -# ifdef MPI -# ifdef GS_BARRIER - MPI_Barrier(comm); -# endif -# ifdef GS_TIMING - t0 = MPI_Wtime(); -# endif - /*gs_op(x,GS_OP_ADD,Q->gs);*/ -#if !defined(USE_FLOAT) - jl_gs(x,gs_double,gs_add,1,Q->jgs,0); -#else - jl_gs(x,gs_float,gs_add,1,Q->jgs,0); -#endif -# ifdef GS_TIMING - t1 = MPI_Wtime() - t0; -# endif -# endif - for(i=0;irn;++i) if(M->row_off[i+1]row_off[i]) return 1; - n = M->row_off[M->rn]; - for(i=0;icol[i]>=M->cn) return 1; - if(isnan(M->pr[i])) return 1; - } - return 0; -} -#endif - -/* z = alpha y + beta M x */ -static double apply_M(real *z, real alpha, const real *y, - real beta, const amg_mat *M, const real *x) -{ - uint i,rn=M->rn; 
- const uint *row_off = M->row_off, *col = M->col; - const real *pr = M->pr; -# ifdef GS_TIMING - double time0 = MPI_Wtime(); -# endif - for(i=0;irn,cn=M->cn; - const uint *row_off = M->row_off, *col = M->col; - const real *pr = M->pr; -# ifdef GS_TIMING - double time0 = MPI_Wtime(); -# endif - - for(i=0;itn)) printf("AMG error: tn not normal (%u)\n",data->pid); - if(data->cn>=data->un) - printf("AMG error: user -> condensed permutation wrongly sized (%u)\n", - data->pid); - for(i=0;iun;++i) - if(data->perm[i] < -1 || data->perm[i] >= (sint)data->cn) - printf("AMG error: user -> condensed permutation not consistent (%u)\n", - data->pid); - for(lvl=0;lvllevels-1;++lvl) { - uint fn = data->lvl_offset[lvl+1] - data->lvl_offset[lvl]; - uint cn = data->lvl_offset[data->levels] - data->lvl_offset[lvl+1]; - if(data->W[lvl].rn != fn || data->Q_W[lvl].nloc != cn || - data->W[lvl].cn != data->Q_W[lvl].n) - printf("AMG error: W %u matrix sized wrongly (%u)\n",lvl,data->pid); - if(data->AfP[lvl].rn != fn || data->Q_AfP[lvl].nloc != cn || - data->AfP[lvl].cn != data->Q_AfP[lvl].n) - printf("AMG error: AfP %u matrix sized wrongly (%u)\n",lvl,data->pid); - if(data->Aff[lvl].rn != fn || data->Q_Aff[lvl].nloc != fn || - data->Aff[lvl].cn != data->Q_Aff[lvl].n) - printf("AMG error: Aff %u matrix sized wrongly (%u)\n",lvl,data->pid); - } - n = 3*(data->levels-1); - for(i=0;iQ_W[i])) - printf("AMG error: communication (%u) data not consistent (%u)\n", - i,data->pid); - if(check_M(&data->W[i])) - printf("AMG error: matrix (%u) data not consistent (%u)\n",i,data->pid); - } - for(i=0;ilvl_offset[data->levels];++i) - if(isnan(data->Dff[i])) - printf("AMG error: NAN in Dff matrix %u (%u)\n",i,data->pid); - for(lvl=data->levels-1;lvl;) { - unsigned ci,m; real alpha, gamma, beta; - --lvl; - m = data->cheb_m[lvl]; - if(m>1) { - alpha = data->cheb_rho[lvl]/2, alpha *= alpha; - gamma = 2*alpha/(1-2*alpha), beta = 1 + gamma; - if(isnan(beta)) - printf("AMG error: Chebyshev coefficient %u NAN 
(%u)\n",lvl,data->pid); - } - for(ci=3;ci<=m;++ci) { - gamma = alpha*beta/(1-alpha*beta), beta = 1 + gamma; - if(isnan(beta) || isnan(gamma)) - printf("AMG error: Chebyshev coefficient %u NAN (%u)\n",lvl,data->pid); - } - } -} -#endif - -void amg_stats(amg_data *data) -{ -#ifdef GS_TIMING - unsigned lvl, lm1 = data->levels - 1; - double *global, *local; - if(data->pid==0) global = tmalloc(double,9*lm1*data->np); - MPI_Gather(data->timing,6*lm1,MPI_DOUBLE, - global, 6*lm1,MPI_DOUBLE, - 0,data->comm); - if(data->pid==0) { -# define DOPRINT(name,i) do { \ - uint p; \ - double *timing = global; \ - for(p=0;pnp;++p,timing+=6*lm1) { \ - double *t = timing; \ - printf("%9d",(unsigned)p); \ - for(lvl=0;lvltiming_n); \ - puts(" AMG " name); \ - } \ - } while(0) - DOPRINT("Wt work",1); - DOPRINT("W,AfP work",3); - DOPRINT("Aff work",5); - DOPRINT("Wt comm",0); - DOPRINT("W,AfP comm",2); - DOPRINT("Aff comm",4); -# undef DOPRINT - } - local = tmalloc(double,9*lm1); - for(lvl=0;lvl<9*lm1;++lvl) local[lvl]=0; - for(lvl=0;lvlQ_W[lvl].gs) - gs_data_stats(local + 9*lvl + 0, data->Q_W[lvl].gs); - if(data->Q_AfP[lvl].gs) - gs_data_stats(local + 9*lvl + 3, data->Q_AfP[lvl].gs); - if(data->Q_Aff[lvl].gs) - gs_data_stats(local + 9*lvl + 6, data->Q_Aff[lvl].gs); - } - MPI_Gather(local ,9*lm1,MPI_DOUBLE, - global,9*lm1,MPI_DOUBLE, - 0,data->comm); - if(data->pid==0) { -# define DOPRINT(name,i,fmt,type) do { \ - uint p; \ - double *gsdata = global; \ - for(p=0;pnp;++p,gsdata+=9*lm1) { \ - double *t = gsdata; \ - printf("%9d",(unsigned)p); \ - for(lvl=0;lvlpid==0) free(global); -#endif -} - -static void amg_exec(amg_data *data) -{ - unsigned lvl, levels = data->levels; - const uint *off = data->lvl_offset; - uint off_bot; - real *b = data->b, *x = data->x; - real *c = data->c, *c_old = data->c_old, *r = data->r; - double *timing = data->timing; -#ifdef MPI - mpicomm_t comm = data->comm; -#else - mpicomm_t comm = 0; -#endif - for(lvl=0;lvlbuf, &data->W[lvl],b_l); - timing[0] += 
apply_Qt(b_lp1, 1,b_lp1, 1,&data->Q_W[lvl],data->buf,comm); - } - if(off[levels]-(off_bot=off[levels-1])) - x[off_bot] = data->Dff[off_bot]*b[off_bot]; - for(lvl=levels-1;lvl;) { - real *b_l, *x_l, *x_lp1, *d_l; uint i,n; - unsigned ci,m; real alpha, gamma, beta; - --lvl; - timing = data->timing + lvl*6; - b_l = b+off[lvl], x_l = x+off[lvl], x_lp1 = x+off[lvl+1]; - d_l = data->Dff+off[lvl]; - n = off[lvl+1]-off[lvl]; - m = data->cheb_m[lvl]; - /* buf = Q x_{l+1} */ - timing[2] += apply_Q(data->buf, &data->Q_AfP[lvl], x_lp1, comm); - /* x_l = W x_{l+1} */ - /* careful: x_l not initialized; 0*x_l could contain nans */ - timing[3] += apply_M(x_l, 0,b_l, 1,&data->W[lvl],data->buf); - /* b_l -= AfP x_{l+1} */ - timing[3] += apply_M(b_l, 1,b_l, -1,&data->AfP[lvl],data->buf); - /* c_1 = D b_l */ - for(i=0;i1) { - alpha = data->cheb_rho[lvl]/2, alpha *= alpha; - gamma = 2*alpha/(1-2*alpha), beta = 1 + gamma; - /* r_1 = b_l - Aff c_1 */ - timing[4] += apply_Q(data->buf, &data->Q_Aff[lvl],c, comm); - timing[5] += apply_M(r, 1,b_l, -1,&data->Aff[lvl],data->buf); - /* c_2 = (1+gamma) (c_1 + D r_1) */ - for(i=0;ibuf, &data->Q_Aff[lvl],c, comm); - timing[5] += apply_M(r, 1,b_l, -1,&data->Aff[lvl],data->buf); - /* c_{i+1} = (1+gamma) (c_i + D r_i) - gamma c_{i-1} */ - for(i=0;itiming_n++; -} - -void amg_solve(real *x, amg_data *data, const real *b) -{ - uint i, un=data->un, cn=data->cn, ln=data->lvl_offset[data->levels]; - real *cb = data->b, *cx = data->x; - for(i=0;iperm[i]; - if(p!=-1) cb[p] += b[i]; - } - jl_gs(cb,gs_double,gs_add,0,data->gs_top,0); - amg_exec(data); - if(data->null_space) { - real avg = 0, sum; - for(i=0;icomm); -#endif - avg = sum/data->tn; - for(i=0;igs_top,0); - for(i=0;iperm[i]; - x[i] = (p == -1 ? 
0 : cx[p]); - } -} - -/* reverse the byte order of the given double - portable up to 129-byte doubles, standards conforming */ -#define N sizeof(double) -static double byteswap(double x) -{ - char t, buf[N]; - memcpy(buf,&x,N); -#define SWAP1(i) if(N>2*(i)+1) t=buf[i],buf[i]=buf[N-1-(i)],buf[N-1-(i)]=t -#define SWAP2(i) SWAP1(i); SWAP1((i)+0x01); SWAP1((i)+0x02); SWAP1((i)+0x03) -#define SWAP3(i) SWAP2(i); SWAP2((i)+0x04); SWAP2((i)+0x08); SWAP2((i)+0x0c) -#define SWAP4(i) SWAP3(i); SWAP3((i)+0x10); SWAP3((i)+0x20); SWAP3((i)+0x30) - SWAP4(0); -#undef SWAP1 -#undef SWAP2 -#undef SWAP3 -#undef SWAP4 - memcpy(&x,buf,N); - return x; -} -#undef N - -typedef struct { - FILE *fptr; - int swap; -} amg_file; - -static void amg_fopen(amg_file *f, const char *filename) -{ - const double magic = 3.14159; - double t; - f->fptr = fopen(filename,"r"); - if(f->fptr==0) fail("AMG: could not open %s for reading\n",filename); - fread(&t,sizeof(double),1,f->fptr); - if(fabs(t-magic)>0.000001) { - t=byteswap(t); - if(fabs(t-magic)>0.000001) - fail("AMG: magic number for endian test not found in %s\n",filename); - f->swap = 1; - } else { - f->swap = 0; - } -} - -static void amg_fread(double *ptr, size_t n, amg_file *f) -{ - size_t nread = fread(ptr,sizeof(double),n,f->fptr); - if(!nread && n) fail("AMG: failed reading %u doubles from disk\n",n); - if(f->swap) { - size_t i; - for(i=0;ifptr); -} - -static void arrange_cdof_by_level(amg_data *data, tuple_list *cdof, buffer *buf) -{ - uint i; unsigned level; - - tuple_list_sort(cdof, cdof_level, buf); - - data->lvl_offset = tmalloc(uint, data->levels+1); - data->lvl_offset[0] = 0; level=1; - for(i=0;in;++i) { - sint *t = &cdof->vi[cdof_mi*i]; - sint l=t[cdof_level], proc=t[cdof_proc]; - if(proc) break; - t[cdof_index]=i; - if(l>=level) for(;level<=l;++level) data->lvl_offset[level]=i; - } - for(;level<=data->levels;++level) data->lvl_offset[level]=i; - for(;in;++i) cdof->vi[cdof_mi*i+cdof_index]=i; -} - -static const int mat_mi=2, 
mat_ml=1, mat_mr=1; -static const int mat_ridx=0, mat_cidx=1; - -static void add_ids(tuple_list *col_id, tuple_list *mat, buffer *buf) -{ - slong last_id = -1; uint last_cidx; - uint i,n = mat->n; - sint *mat_vi = mat->vi; slong *mat_vl = mat->vl; - uint ci=0,cn = col_id->n; - sint *col_vi = col_id->vi; slong *col_vl = col_id->vl; - tuple_list_sort(col_id,1,buf); - tuple_list_sort(mat,mat_mi,buf); - for(i=0;in++; - if(cidn==col_id->max) { - tuple_list_grow(col_id); - col_vi = col_id->vi+ci, col_vl = col_id->vl+ci; - } - mat_vi[mat_cidx] = col_id->vi[cidn] = last_cidx = cidn; - col_id->vl[cidn] = id; - } - } -} - -static void organize_matrix(amg_Q *Q, amg_mat *M, - uint rn, uint cnloc, real cnglob, - tuple_list *id, tuple_list *mat, - crystal_data *crystal, buffer *buf) -{ - uint ri,i,n, *row_off, *col; real *pr; - sint *mat_vi; real *mat_vr; - add_ids(id,mat,buf); - tuple_list_sort(id,0,buf); - - Q->nloc = cnloc; - Q->n = id->n; - /*Q->gs = gs_data_setup(id->n,(ulong*)id->vl,1,crystal);*/ - { -#ifdef MPI - jl_comm_t comm = {crystal->id,crystal->num,crystal->comm}; -#else - jl_comm_t comm = {0,1,0}; -#endif - uint i,ie; - for(i=cnloc,ie=id->n;i!=ie;++i) id->vl[i] = -id->vl[i]; - Q->jgs = jl_gs_setup(id->vl,id->n,&comm); - for(i=cnloc,ie=id->n;i!=ie;++i) id->vl[i] = -id->vl[i]; - } - - M->rn = rn; - M->cn = id->n; - row_off = M->row_off = tmalloc(uint,(rn+1)+mat->n); - col = M->col = M->row_off + (rn+1); - pr = M->pr = tmalloc(real,mat->n); - tuple_list_sort(mat,mat_cidx,buf); - tuple_list_sort(mat,mat_ridx,buf); - mat_vi = mat->vi, mat_vr = mat->vr; - ri = 0, n = mat->n; - for(i=0;ilevels; - uint nloc; - uint *perm_level; - real *gln, *gln_in; - tuple_list mat[3]; - tuple_list C_id, F_id; uint Fnloc, Cnloc; real Fnglob, Cnglob; - for(i=0;in;++i) - if(cdof->vi[cdof_mi*i+cdof_proc]) - cdof->vi[cdof_mi*i+cdof_level] = nl; - - arrange_cdof_by_level(data,cdof,buf); - - gln = tmalloc(real, nl); - buffer_reserve(buf, nl*sizeof(real)); gln_in = buf->ptr; - 
for(i=0;ilvl_offset[i+1]-data->lvl_offset[i]; -#ifdef MPI - MPI_Allreduce(gln_in,gln,nl,REAL_MPI,MPI_SUM,data->comm); -#endif - if(data->pid==0) - for(i=0;ilvl_offset[nl]; - data->Q_W = tmalloc(amg_Q,3*(nl-1)); - data->Q_AfP = data->Q_W + (nl-1); - data->Q_Aff = data->Q_AfP + (nl-1); - data->W = tmalloc(amg_mat,3*(nl-1)); - data->AfP = data->W + (nl-1); - data->Aff = data->AfP + (nl-1); - - data->Dff = tmalloc(real, nloc); - for(i=0;iDff[i] = cdof->vr[i]; - - tuple_list_init(&C_id,1,1,0); - tuple_list_init(&F_id,1,1,0); - for(m=0;m<3;++m) { - uint size=0; - for(i=0;ivi+cdof_level,nloc,cdof_mi, perm_level, buf->ptr); - tuple_list_sort(cdof, cdof_index, buf); - - for(lvl=0;lvllvl_offset[lvl],ke=data->lvl_offset[lvl+1],k; - for(m=0;m<3;++m) mat[m].n=0; - for(k=kb;k!=ke;++k) { - uint i = perm_level[k]; - uint ridx = k-kb; - for(m=0;m<3;++m) { - uint j,jb=row_off[m][i],je=row_off[m][i+1]; - sint *vi; slong *vl; real *vr; - tuple_list_reserve(&mat[m],mat[m].n+(je-jb)); - vi = mat[m].vi+mat_mi*mat[m].n; - vl = mat[m].vl+mat_ml*mat[m].n; - vr = mat[m].vr+mat_mr*mat[m].n; - mat[m].n+=je-jb; - for(j=jb;j!=je;++j,vi+=mat_mi,vl+=mat_ml,vr+=mat_mr) - vi[mat_ridx]=ridx, vi[mat_cidx]=0, - vl[0]=raw_mat[m][2*j], vr[0]=raw_mat[m][2*j+1]; - } - } - Fnloc = F_id.n = ke-kb; Fnglob = gln[lvl]; - tuple_list_reserve(&F_id,F_id.n); - for(k=kb;k!=ke;++k) { - F_id.vi[k-kb]=k-kb; - F_id.vl[k-kb]=cdof->vl[k]; - } - kb = ke; ke = data->lvl_offset[nl]; - Cnloc = C_id.n = ke-kb; - Cnglob = 0; - for(k=lvl+1;kvl[k]; - } - - organize_matrix(&data->Q_W[lvl],&data->W[lvl], - Fnloc, Cnloc, Cnglob, &C_id, &mat[0], - crystal, buf); - organize_matrix(&data->Q_AfP[lvl],&data->AfP[lvl], - Fnloc, Cnloc, Cnglob, &C_id, &mat[1], - crystal, buf); - organize_matrix(&data->Q_Aff[lvl],&data->Aff[lvl], - Fnloc, Fnloc, Fnglob, &F_id, &mat[2], - crystal, buf); - } - - free(gln); - free(perm_level); - for(m=0;m<3;++m) tuple_list_free(&mat[m]); - tuple_list_free(&C_id); - tuple_list_free(&F_id); -} - -static void 
read_main_data(amg_data *data, amg_file *f, buffer *buf) -{ - unsigned n; - if(data->pid==0) { - double t; - amg_fread(&t,1,f); - data->levels = t; - printf("AMG: %u levels\n", data->levels); - } -#ifdef MPI - MPI_Bcast(&data->levels,1,MPI_UNSIGNED,0,data->comm); -#endif - n = data->levels-1; - data->cheb_m = tmalloc(unsigned,n); - data->cheb_rho = tmalloc(real ,n); - if(data->pid==0) { - double *t; unsigned i; - buffer_reserve(buf,(2*n+1)*sizeof(double)), t=buf->ptr; - amg_fread(t,2*n+1,f); - for(i=0;icheb_m[i] = *t++; - for(i=0;icheb_rho[i] = *t++; - data->tn = *t++; - } -#ifdef MPI - MPI_Bcast(data->cheb_m ,n,MPI_UNSIGNED,0,data->comm); - MPI_Bcast(data->cheb_rho ,n,REAL_MPI ,0,data->comm); - MPI_Bcast(&data->tn ,1,REAL_MPI ,0,data->comm); -#endif - if(data->pid==0) { unsigned i; - printf("AMG Chebyshev smoother data:\n"); - for(i=0;icheb_m[i], (double)data->cheb_rho[i]); - printf("AMG: %g rows\n", data->tn); - } -} - -#ifndef AMG_BLOCK_ROWS -# define AMG_BLOCK_ROWS 1200 -#endif - -static void read_all_files(amg_data *data, tuple_list *cdof, - crystal_data *crystal, buffer* buf) -{ - static double row_data[AMG_BLOCK_ROWS][6]; - ulong ntot, r; unsigned passes; - uint cdof_ib=0, cdof_ie, cdof_n=cdof->n; - tuple_list dof_owner, dof_data; - const int dof_data_mi=5, - dof_data_proc=0, - dof_data_level=1, - dof_data_mat_len=2; - amg_file file_main, file_mat[3]; - struct { uint *row_len; size_t size, size_old; buffer buf; } raw_mat[3]; - raw_mat[0].row_len = tmalloc(uint,3*(cdof_n+1)); - raw_mat[1].row_len = raw_mat[0].row_len + (cdof_n+1); - raw_mat[2].row_len = raw_mat[1].row_len + (cdof_n+1); - raw_mat[0].size=raw_mat[1].size=raw_mat[2].size=0; - buffer_init(&raw_mat[0].buf,1024); - buffer_init(&raw_mat[1].buf,1024); - buffer_init(&raw_mat[2].buf,1024); - if(data->pid==0) { - amg_fopen(&file_main,"amg.dat"); - amg_fopen(&file_mat[0],"amg_W.dat"); - amg_fopen(&file_mat[1],"amg_AfP.dat"); - amg_fopen(&file_mat[2],"amg_Aff.dat"); - } - 
read_main_data(data,&file_main,buf); - ntot = data->tn; - passes = (ntot + AMG_BLOCK_ROWS - 1)/AMG_BLOCK_ROWS; - tuple_list_init_max(&dof_owner,1,1,0,AMG_BLOCK_ROWS); - tuple_list_init_max(&dof_data ,dof_data_mi,1,1,AMG_BLOCK_ROWS); - for(r=0;rAMG_BLOCK_ROWS ? AMG_BLOCK_ROWS : (unsigned)ntot-r; - unsigned m, mat_size[3]; - ulong id_end; uint i; - if(data->pid==0) { - amg_fread(&row_data[0][0],nr*6,&file_main); - id_end = row_data[nr-1][0]; - printf("AMG: reading through row %g, pass %u/%u\n", - (double)id_end, (unsigned)(r/AMG_BLOCK_ROWS+1), passes); - } -#ifdef MPI - MPI_Bcast(&id_end,1,ULONG_MPI,0,data->comm); -#endif - for(i=cdof_ib;ivi[cdof_mi*i+cdof_proc]) break; - id = cdof->vl[i]; - if(id>id_end) break; - dof_owner.vl[i-cdof_ib] = id; - dof_owner.vi[i-cdof_ib] = 0; - } - cdof_ie = i; - dof_owner.n = cdof_ie-cdof_ib; -#ifdef MPI - transfer(1,&dof_owner,0,crystal); -#endif - if(data->pid==0) { - sint *outi=dof_data.vi, *ini=dof_owner.vi; - slong *outl=dof_data.vl, *inl=dof_owner.vl; - real *outr=dof_data.vr; - if(dof_owner.n!=nr) - failwith("AMG: nobody claimed ownership of some rows"); - dof_data.n=nr; - tuple_list_sort(&dof_owner,1,buf); - mat_size[0]=mat_size[1]=mat_size[2]=0; - for(i=0;ivl[cdof_ib+i]) - failwith("AMG: setup problem"); - cdof->vi[cdof_mi*(cdof_ib+i)+cdof_level] - = dof_data.vi[dof_data_mi*i+dof_data_level]-1; - cdof->vr[cdof_ib+i]=dof_data.vr[i]; - for(m=0;m<3;++m) { - unsigned l = dof_data.vi[dof_data_mi*i+dof_data_mat_len+m]; - raw_mat[m].size += 2*l*sizeof(double); - raw_mat[m].row_len[cdof_ib+i] = l; - } - } - for(m=0;m<3;++m) buffer_reserve(&raw_mat[m].buf,raw_mat[m].size); - if(data->pid==0) { - const char *mat_name[3] = {"W","AfP","Aff"}; - for(m=0;m<3;++m) { - double *ptr, *my_ptr = - (double*)((char*)raw_mat[m].buf.ptr+raw_mat[m].size_old); - buffer_reserve(buf,mat_size[m]*sizeof(double)), ptr=buf->ptr; - printf("AMG: reading %g MB of %s\n", - mat_size[m]*sizeof(double)/(1024*1024.0),mat_name[m]); - 
amg_fread(ptr,mat_size[m],&file_mat[m]); - for(i=0;icomm); - else -#endif - memcpy(my_ptr,ptr,2*len*sizeof(double)), my_ptr+=2*len; - ptr += 2*len; - } - } - } else { -#ifdef MPI - MPI_Status status; - for(m=0;m<3;++m) { - double *ptr = (double*)((char*)raw_mat[m].buf.ptr+raw_mat[m].size_old); - for(i=0;ivl[cdof_ib+i]; - MPI_Recv(ptr,2*len,MPI_DOUBLE,0,tag,data->comm,&status); - ptr += 2*len; - } - } -#endif - } - cdof_ib=cdof_ie; - } - tuple_list_free(&dof_owner); - tuple_list_free(&dof_data); - if(data->pid==0) { - amg_fclose(&file_main); - amg_fclose(&file_mat[0]); - amg_fclose(&file_mat[1]); - amg_fclose(&file_mat[2]); - } - { - uint *row_len[3] = { raw_mat[0].row_len, - raw_mat[1].row_len, - raw_mat[2].row_len }; - double *mat_data[3] = { raw_mat[0].buf.ptr, - raw_mat[1].buf.ptr, - raw_mat[2].buf.ptr }; - organize_data(data,cdof,row_len,mat_data,crystal,buf); - } - free(raw_mat[0].row_len); - buffer_free(&raw_mat[0].buf); - buffer_free(&raw_mat[1].buf); - buffer_free(&raw_mat[2].buf); -} - -static void separate_cdof(tuple_list *cdof, uint pid, buffer *buf) -{ - sint *proc = cdof->vi + cdof_proc; - sint *proc_end = proc + cdof_mi * cdof->n; - for(;proc!=proc_end;proc+=cdof_mi) *proc = (*proc==pid ? 
0 : 1); - tuple_list_sort(cdof,cdof_proc,buf); -} - -amg_data *amg_setup(uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const real *A, - uint null_space, crystal_data *crystal) -{ - amg_data *data = tmalloc(amg_data,1); - sint *perm; tuple_list cdof; - uint i, bufn, chbn; -#ifdef MPI - buffer *buf = &crystal->all->buf; -#else - buffer buf_static, *buf=&buf_static; - buffer_init(buf,1024); -#endif - -#ifdef MPI - MPI_Comm_dup(crystal->comm,&data->comm); - data->pid = crystal->id, data->np = crystal->num; -#else - data->pid = 0, data->np = 1; -#endif - - data->null_space = null_space; - - perm = tmalloc(sint,n); - setup_dofs(&cdof,perm, n,id, data->pid, crystal,buf); - separate_cdof(&cdof,data->pid,buf); - read_all_files(data,&cdof,crystal,buf); - - { -#ifdef MPI - jl_comm_t comm = {crystal->id,crystal->num,crystal->comm}; -#else - jl_comm_t comm = {0,1,0}; -#endif - data->gs_top = jl_gs_setup((const slong*)cdof.vl,cdof.n,&comm); - } - - bufn = 0, chbn=0; - for(i=0;ilevels-1;++i) { - uint fn = data->lvl_offset[i+1]-data->lvl_offset[i]; - if(fn>chbn) chbn=fn; - if(data->W[i].cn > bufn) bufn=data->W[i].cn; - if(data->AfP[i].cn > bufn) bufn=data->AfP[i].cn; - if(data->Aff[i].cn > bufn) bufn=data->Aff[i].cn; - } - data->b = tmalloc(real,2*cdof.n+3*chbn+bufn); - data->x = data->b + cdof.n; - data->c = data->x + cdof.n; - data->c_old = data->c + chbn; - data->r = data->c_old + chbn; - data->buf = data->r + chbn; - - tuple_list_sort(&cdof,cdof_mi,buf); - for(i=0;iun = n; - data->cn = cdof.n; - data->perm = perm; - - data->timing = tmalloc(double,6*(data->levels-1)); - for(i=0;i<6*(data->levels-1);++i) data->timing[i]=0; - data->timing_n = 0; - - tuple_list_free(&cdof); -#ifndef MPI - buffer_free(buf); -#endif - - if(data->pid==0) printf("AMG: initialized\n"), fflush(stdout); -#ifdef DIAGNOSTICS - check_amg_data(data); -#ifdef MPI - MPI_Barrier(data->comm); -#endif - if(data->pid==0) printf("AMG: sanity check complete\n"), fflush(stdout); -#endif - 
return data; -} - -void amg_free(amg_data *data) -{ - uint i,n; - free(data->perm); - free(data->cheb_m); - free(data->cheb_rho); - free(data->lvl_offset); - free(data->Dff); - free(data->b); - jl_gs_free(data->gs_top); - n = 3*(data->levels-1); - for(i=0;iQ_W[i].gs); */ - jl_gs_free(data->Q_W[i].jgs); - free(data->W[i].row_off); - free(data->W[i].pr); - } - free(data->Q_W); - free(data->W); - free(data->timing); - free(data); -} diff --git a/3rdParty/gslib.github/src/crs.h b/3rdParty/gslib.github/src/crs.h deleted file mode 100644 index e2d0d36f4..000000000 --- a/3rdParty/gslib.github/src/crs.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef CRS_H -#define CRS_H - -#if !defined(COMM_H) -#warning "crs.h" requires "comm.h" -#endif - -#define crs_setup PREFIXED_NAME(crs_setup) -#define crs_solve PREFIXED_NAME(crs_solve) -#define crs_stats PREFIXED_NAME(crs_stats) -#define crs_free PREFIXED_NAME(crs_free ) - -struct crs_data; - -struct crs_data *crs_setup( - uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const double *A, - uint null_space, const struct comm *comm); -void crs_solve(double *x, struct crs_data *data, double *b); -void crs_stats(struct crs_data *data); -void crs_free(struct crs_data *data); - -#endif - diff --git a/3rdParty/gslib.github/src/crs_test.c b/3rdParty/gslib.github/src/crs_test.c deleted file mode 100644 index e5367d2b1..000000000 --- a/3rdParty/gslib.github/src/crs_test.c +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "gs_defs.h" -#include "comm.h" -#include "gs.h" -#include "crs.h" - -void test(const struct comm *const comm) -{ - const double A[16] = { 2, -1, -1, 0, - -1, 2, 0, -1, - -1, 0, 2, -1, - 0, -1, -1, 2 }; - const uint Ai[16] = { 0, 0, 0, 0, - 1, 1, 1, 1, - 2, 2, 2, 2, - 3, 3, 3, 3 }, - Aj[16] = { 0, 1, 2, 3, - 0, 1, 2, 3, - 0, 1, 2, 3, - 0, 1, 2, 3 }; - ulong xid[4]; slong uid[4]; - 
double x[4]={1,1,1,1}, b[4], bmean; - uint i, w, gn, px, py; - - slong *xgid=0; double *xg=0; struct gs_data *gsh; - - struct crs_data *crs; - - w = ceil(sqrt(comm->np)); gn = (w+1)*(w+1); - - if(comm->id==0) printf("arranging procs in a %u x %u square\n", w, w); - - px = comm->id%w, py = comm->id/w; - b[0] = xid[0] = (w+1)*py +px+1; - b[1] = xid[1] = (w+1)*py +px+2; - b[2] = xid[2] = (w+1)*(py+1)+px+1; - b[3] = xid[3] = (w+1)*(py+1)+px+2; - - gn = comm_reduce_slong(comm, gs_max, (const slong*)&xid[3],1); - bmean = comm_reduce_double(comm, gs_add, b,4)/gn; - - gsh = gs_setup((const slong*)xid,4, comm,0,gs_crystal_router,0); - gs(x,gs_double,gs_add,0,gsh,0); - gs(b,gs_double,gs_add,0,gsh,0); - for(i=0;i<4;++i) b[i]=xid[i]-bmean/x[i]; - gs(b,gs_double,gs_add,0,gsh,0); - gs_free(gsh); - - gsh = gs_setup((const slong*)xid,4, comm,1,gs_crystal_router,0); - for(i=0;i<4;++i) uid[i]=comm->id; - gs(uid,gs_slong,gs_min,0,gsh,0); - gs_free(gsh); - for(i=0;i<4;++i) uid[i] = (uid[i]==comm->id?(slong)xid[i]:-(slong)xid[i]); - - if(comm->id==0) { - xgid = tmalloc(slong, gn); - xg = tmalloc(double,gn); - for(i=0;iid?uid:xgid,comm->id?4:gn, comm,0,gs_crystal_router,0); - - - if(comm->id==0) for(i=0;i<4;++i) xg[xid[i]-1]=b[i]; - gs(comm->id?b:xg,gs_double,gs_add, 0, gsh, 0); - if(comm->id==0) for(i=0;iid==0) for(i=0;i<4;++i) xg[xid[i]-1]=x[i]; - gs(comm->id?x:xg,gs_double,gs_add, 0, gsh, 0); - if(comm->id==0) for(i=0;iid==0) free(xg), free(xgid); -} - -int main(int narg, char* arg[]) -{ - comm_ext world; int np; - struct comm comm; -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - test(&comm); - comm_free(&comm); - -#ifdef MPI - MPI_Finalize(); -#endif - - return 0; -} - diff --git a/3rdParty/gslib.github/src/crystal2.c b/3rdParty/gslib.github/src/crystal2.c deleted file mode 100644 index d79fca5b7..000000000 --- a/3rdParty/gslib.github/src/crystal2.c +++ /dev/null @@ -1,159 +0,0 @@ 
-/*------------------------------------------------------------------------------ - - Crystal Router - - Accomplishes all-to-all communication in log P msgs per proc - The routine is low-level; the format of the input/output is an - array of integers, consisting of a sequence of messages with format: - - target proc - source proc - m - integer - integer - ... - integer (m integers in total) - - Before crystal_router is called, the source of each message should be - set to this proc id; upon return from crystal_router, the target of each - message will be this proc id. - - Usage: - - MPI_Comm comm = ... ; - crystal_data crystal; - - crystal_init(&crystal, comm); // initialize the data structure - // now crystal.id = this proc - // and crystal.num = num of procs - - // allocate space for at least MAX ints - buffer_reserve(&crystal->all->buf, MAX*sizeof(uint)); - - // fill up ((uint*)crystal->all->buf.ptr)[0 ... n-1] - // and set crystal->all->n - - crystal_router(&crystal); - - // incoming messages available as - // ((uint*)crystal->all->buf.ptr)[0 ... 
crystal->all->n-1] - - crystal_free(&crystal); // release acquired memory - - ----------------------------------------------------------------------------*/ - -#ifdef MPI - -#include -#include -#include -#include -#include - -#include "errmem.h" -#include "types.h" - -typedef struct { uint n; buffer buf; } crystal_buf; - -typedef struct { - crystal_buf buffers[3]; - crystal_buf *all, *keep, *send; - MPI_Comm comm; - uint num, id; -} crystal_data; - -#define crystal_free crystal_old_free - -void crystal_init(crystal_data *p, MPI_Comm comm) -{ - int num,id; - buffer_init(&p->buffers[0].buf,1024); - buffer_init(&p->buffers[1].buf,1024); - buffer_init(&p->buffers[2].buf,1024); - p->all=&p->buffers[0]; - p->keep=&p->buffers[1]; - p->send=&p->buffers[2]; - memcpy(&p->comm,&comm,sizeof(MPI_Comm)); - MPI_Comm_rank(comm,&id ); p->id =id ; - MPI_Comm_size(comm,&num); p->num=num; -} - -void crystal_free(crystal_data *p) -{ - buffer_free(&p->buffers[0].buf); - buffer_free(&p->buffers[1].buf); - buffer_free(&p->buffers[2].buf); -} - -static void crystal_partition(crystal_data *p, uint cutoff, - crystal_buf *lo, crystal_buf *hi) -{ - const uint *src = p->all->buf.ptr; - const uint *end = src+p->all->n; - uint *target, *lop, *hip; - lo->n=hi->n=0; - buffer_reserve(&lo->buf,p->all->n*sizeof(uint)); - buffer_reserve(&hi->buf,p->all->n*sizeof(uint)); - lop = lo->buf.ptr, hip = hi->buf.ptr; - while(src!=end) { - uint chunk_len = 3 + src[2]; - if(src[0]n+=chunk_len,lop+=chunk_len; - else target=hip,hi->n+=chunk_len,hip+=chunk_len; - memcpy(target,src,chunk_len*sizeof(uint)); - src+=chunk_len; - } -} - -static void crystal_send(crystal_data *p, uint target, int recvn) -{ - MPI_Request req[3]; - MPI_Status status[3]; - uint count[2]={0,0},sum,*recv[2]; - crystal_buf *t; - int i; - - for(i=0;icomm,&req[i+1]); - MPI_Isend(&p->send->n,1,UINT_MPI,target ,p->id ,p->comm,&req[ 0]); - MPI_Waitall(recvn+1,req,status); - sum = p->keep->n; - for(i=0;ikeep->buf,sum*sizeof(uint)); - 
recv[0]=p->keep->buf.ptr; - recv[0]+=p->keep->n; - recv[1]=recv[0]+count[0]; - p->keep->n=sum; - - MPI_Isend(p->send->buf.ptr,p->send->n*sizeof(uint), - MPI_UNSIGNED_CHAR,target,p->id,p->comm,&req[0]); - if(recvn) { - MPI_Irecv(recv[0],count[0]*sizeof(uint),MPI_UNSIGNED_CHAR, - target,target,p->comm,&req[1]); - if(recvn==2) - MPI_Irecv(recv[1],count[1]*sizeof(uint),MPI_UNSIGNED_CHAR, - target+1,target+1,p->comm,&req[2]); - } - MPI_Waitall(recvn+1,req,status); - - t=p->all,p->all=p->keep,p->keep=t; -} - -void crystal_router(crystal_data *p) -{ - uint bl=0, bh, n=p->num, nl, target; - int recvn; - crystal_buf *lo, *hi; - while(n>1) { - nl = n/2, bh = bl+nl; - if(p->idid+nl,recvn=(n&1 && p->id==bh-1)?2:1 ,lo=p->keep,hi=p->send; - else - target=p->id-nl,recvn=(target==bh)?(--target,0):1,hi=p->keep,lo=p->send; - crystal_partition(p,bh,lo,hi); - crystal_send(p,target,recvn); - if(p->id -#include -#include -#include "errmem.h" - -void eexit(void) { nek_exitt(); } /* exit wrapper */ - -void fail(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - va_end(ap); - eexit(); -} - diff --git a/3rdParty/gslib.github/src/errmem.h b/3rdParty/gslib.github/src/errmem.h deleted file mode 100644 index 2665256d8..000000000 --- a/3rdParty/gslib.github/src/errmem.h +++ /dev/null @@ -1,79 +0,0 @@ -#include "fname.h" - -#ifndef ERRMEM_H -#define ERRMEM_H - -/* requires: - for malloc, calloc, realloc, free -*/ - -/*-------------------------------------------------------------------------- - Error Reporting - Memory Allocation Wrappers to Catch Out-of-memory - --------------------------------------------------------------------------*/ - -#ifdef __GNUC__ -void fail(const char *fmt, ...) 
__attribute__ ((noreturn)); -#else -void fail(const char *fmt, ...); -#endif - -static void failwith(const char *string) -{ - fail("%s\n",string); -} - -static void *smalloc(size_t size, const char *file) -{ - void *res = malloc(size); - if(!res && size) fail("%s: allocation of %d bytes failed\n",file,(int)size); - return res; -} - -static void *scalloc(size_t nmemb, size_t size, const char *file) -{ - void *res = calloc(nmemb, size); - if(!res && nmemb) - fail("%s: allocation of %d bytes failed\n",file,(int)size*nmemb); - return res; -} - -static void *srealloc(void *ptr, size_t size, const char *file) -{ - void *res = realloc(ptr, size); - if(!res && size) fail("%s: allocation of %d bytes failed\n",file,(int)size); - return res; -} - -#define tmalloc(type, count) \ - ((type*) smalloc((count)*sizeof(type),__FILE__) ) -#define tcalloc(type, count) \ - ((type*) scalloc((count),sizeof(type),__FILE__) ) -#define trealloc(type, ptr, count) \ - ((type*) srealloc((ptr),(count)*sizeof(type),__FILE__) ) - -typedef struct { size_t size; void *ptr; } buffer; -static void buffer_init_(buffer *b, size_t size, const char *file) -{ - b->size=size, b->ptr=smalloc(size,file); -} -static void buffer_reserve_(buffer *b, size_t min, const char *file) -{ - size_t size = b->size; - if(sizeptr=srealloc(b->ptr,size,file); - b->size=size; - } -} -static void buffer_free(buffer *b) { free(b->ptr); } - -#define buffer_init(b,size) buffer_init_(b,size,__FILE__) -#define buffer_reserve(b,min) buffer_reserve_(b,min,__FILE__) - -#endif - -#define nek_exitt FORTRAN_NAME(exitt,EXITT) -void nek_exitt(void); -void eexit(void); diff --git a/3rdParty/gslib.github/src/ext/findpts_local.h b/3rdParty/gslib.github/src/ext/findpts_local.h deleted file mode 100644 index 2c9634ef2..000000000 --- a/3rdParty/gslib.github/src/ext/findpts_local.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef NEK_FINDPTS_LOCAL_H -#define NEK_FINDPTS_LOCAL_H - -#include "nek_config.h" - -#define TOKEN_PASTE_(a,b) a##b -#define 
TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b) - -#ifdef NEK_FUN_PREFIX -# define NEK_PREFIXED_NAME(x) TOKEN_PASTE(NEK_FUN_PREFIX,x) -#else -# define NEK_PREFIXED_NAME(x) x -#endif - - -#define findpts_local_setup NEK_PREFIXED_NAME(findpts_local_setup) -#define findpts_local_free NEK_PREFIXED_NAME(findpts_local_free ) -#define findpts_local NEK_PREFIXED_NAME(findpts_local ) -#define findpts_local_eval NEK_PREFIXED_NAME(findpts_local_eval ) - - -struct findpts_local_data; - -struct findpts_local_data *findpts_local_setup( - const unsigned dim, - const double *const elx[], const unsigned n[], const nek_uint nel, - const unsigned m[], const double bbox_tol, const nek_uint max_hash_size, - const nek_uint npt_max, const double newt_tol); - -void findpts_local_free(struct findpts_local_data *p); - -void findpts_local( - nek_uint *const code_base , const unsigned code_stride , - nek_uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[], const unsigned x_stride[], - const nek_uint npt, struct findpts_local_data *const p); - -void findpts_local_eval( - double *const out_base, const unsigned out_stride, - nek_uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const nek_uint npt, - const double *const in, struct findpts_local_data *const p); - -#endif diff --git a/3rdParty/gslib.github/src/ext/findpts_local_ext.c b/3rdParty/gslib.github/src/ext/findpts_local_ext.c deleted file mode 100644 index 59344729a..000000000 --- a/3rdParty/gslib.github/src/ext/findpts_local_ext.c +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include -#include -#include -#include -#include "../c99.h" -#include "../name.h" -#include "../fail.h" -#include "../mem.h" -#include "../types.h" -#include "../poly.h" -#include "../obbox.h" -#include "../findpts_el.h" -#include "../findpts_local.h" - -#define findpts_local_setup 
PREFIXED_NAME(findpts_local_setup) -#define findpts_local_free PREFIXED_NAME(findpts_local_free ) -#define findpts_local PREFIXED_NAME(findpts_local ) -#define findpts_local_eval PREFIXED_NAME(findpts_local_eval ) - -struct findpts_local_data { - unsigned dim; /* 2 or 3 */ - buffer buf; - union { - struct findpts_local_data_2 d2; - struct findpts_local_data_3 d3; - } fld; -}; - -struct findpts_local_data *findpts_local_setup( - const unsigned dim, - const double *const elx[], const unsigned n[], const uint nel, - const unsigned m[], const double bbox_tol, const uint max_hash_size, - const uint npt_max, const double newt_tol) -{ - struct findpts_local_data *p = tmalloc(struct findpts_local_data, 1); - - p->dim = dim; - memset(&p->buf,0,sizeof(buffer)); - if(dim==2) - findpts_local_setup_2(&p->fld.d2, elx,n,nel, m, bbox_tol, - max_hash_size, npt_max, newt_tol); - else - findpts_local_setup_3(&p->fld.d3, elx,n,nel, m, bbox_tol, - max_hash_size, npt_max, newt_tol); - - return p; -} - -void findpts_local_free(struct findpts_local_data *p) -{ - buffer_free(&p->buf); - if(p->dim==2) findpts_local_free_2(&p->fld.d2); - else findpts_local_free_3(&p->fld.d3); - free(p); -} - -void findpts_local( - uint *const code_base , const unsigned code_stride , - uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[], const unsigned x_stride[], - const uint npt, struct findpts_local_data *const p) -{ - if(p->dim==2) findpts_local_2(code_base, code_stride, - el_base, el_stride, - r_base, r_stride, - dist2_base,dist2_stride, - x_base, x_stride, - npt, &p->fld.d2, &p->buf); - else findpts_local_3(code_base, code_stride, - el_base, el_stride, - r_base, r_stride, - dist2_base,dist2_stride, - x_base, x_stride, - npt, &p->fld.d3, &p->buf); -} - -void findpts_local_eval( - double *const out_base, const unsigned out_stride, - const uint *const el_base, const 
unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double *const in, struct findpts_local_data *const p) -{ - if(p->dim==2) findpts_local_eval_2(out_base, out_stride, - el_base, el_stride, - r_base, r_stride, - npt, in, &p->fld.d2); - else findpts_local_eval_3(out_base, out_stride, - el_base, el_stride, - r_base, r_stride, - npt, in, &p->fld.d3); -} - diff --git a/3rdParty/gslib.github/src/ext/nek_config.h.in b/3rdParty/gslib.github/src/ext/nek_config.h.in deleted file mode 100644 index e481d1f0c..000000000 --- a/3rdParty/gslib.github/src/ext/nek_config.h.in +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef NEK_CONFIG_H -#define NEK_CONFIG_H - -#define NEK_FUN_PREFIX @FUN_PREFIX@ - -typedef @NEK_UINT@ nek_uint; - -#endif diff --git a/3rdParty/gslib.github/src/fcrs.c b/3rdParty/gslib.github/src/fcrs.c deleted file mode 100644 index afa337a8b..000000000 --- a/3rdParty/gslib.github/src/fcrs.c +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "comm.h" -#include "crs.h" - -/*-------------------------------------------------------------------------- - FORTRAN Interface to coarse solver - --------------------------------------------------------------------------*/ - -#undef crs_setup -#undef crs_solve -#undef crs_stats -#undef crs_free -#define ccrs_setup PREFIXED_NAME(crs_setup) -#define ccrs_solve PREFIXED_NAME(crs_solve) -#define ccrs_stats PREFIXED_NAME(crs_stats) -#define ccrs_free PREFIXED_NAME(crs_free ) - -#define fcrs_setup FORTRAN_NAME(crs_setup,CRS_SETUP) -#define fcrs_solve FORTRAN_NAME(crs_solve,CRS_SOLVE) -#define fcrs_stats FORTRAN_NAME(crs_stats,CRS_STATS) -#define fcrs_free FORTRAN_NAME(crs_free ,CRS_FREE) - -static struct crs_data **handle_array = 0; -static int handle_max = 0; -static int handle_n = 0; - -void fcrs_setup(sint *handle, const MPI_Fint *comm, const sint *np, - const sint *n, 
const slong id[], const sint *nz, - const sint Ai[], const sint Aj[], const double A[], - const sint *null_space) -{ - struct comm c; - if(handle_n==handle_max) - handle_max+=handle_max/2+1, - handle_array=trealloc(struct crs_data*,handle_array,handle_max); - comm_init_check(&c, *comm, *np); - handle_array[handle_n]=ccrs_setup(*n,(const ulong*)id, - *nz,(const uint*)Ai,(const uint*)Aj,A, - *null_space,&c); - comm_free(&c); - *handle = handle_n++; -} - -#define CHECK_HANDLE(func) do \ - if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) \ - fail(1,__FILE__,__LINE__,func ": invalid handle"); \ -while(0) - -void fcrs_solve(const sint *handle, double x[], double b[]) -{ - CHECK_HANDLE("crs_solve"); - ccrs_solve(x,handle_array[*handle],b); -} - -void fcrs_stats(const sint *handle) -{ - CHECK_HANDLE("crs_stats"); - ccrs_stats(handle_array[*handle]); -} - -void fcrs_free(sint *handle) -{ - CHECK_HANDLE("crs_free"); - ccrs_free(handle_array[*handle]); - handle_array[*handle] = 0; -} - - diff --git a/3rdParty/gslib.github/src/fcrystal2.c b/3rdParty/gslib.github/src/fcrystal2.c deleted file mode 100644 index ca1e62825..000000000 --- a/3rdParty/gslib.github/src/fcrystal2.c +++ /dev/null @@ -1,110 +0,0 @@ -/*------------------------------------------------------------------------------ - - FORTRAN interface for crystal router - - integer h, np - MPI_Comm comm - call crystal_new(h,comm,np) ! set h to handle to new instance - ! it is a runtime error if MPI_Comm_size gives a value different than np - call crystal_done(h) ! release instance - - integer*? vi(mi,max) ! these integer and real types - integer*? vl(ml,max) ! better match up with what is - real vr(mr,max) ! in "types.h" - call crystal_transfer(h,n,max,vi,mi,vl,ml,vr,mr,p) - - - this treats { vi(:,i), vl(:,i), vr(:,i) } , i in [1 ... 
n] - as a list of n tuples with mi integers and md reals each - - the parameter p indicates that the tuple - { vi(:,i), vl(:,i), vr(:,i) } should be sent to proc vi(p,i), - and that on return, vi(p,j) will be the source proc of tuple j - - n will be set to the number of tuples that came in - - if more tuples come in than max, n will be set to max+1, - although only max tuples were stored (the rest are lost) - - ----------------------------------------------------------------------------*/ - -#include -#include -#include -#ifdef MPI -# include -#endif - -#include "fname.h" -#include "errmem.h" -#include "types.h" -#ifdef MPI -# include "crystal.h" -# include "tuple_list.h" -# include "transfer.h" -#else - typedef void MPI_Comm; -#endif - -#define fcrystal_new FORTRAN_NAME(crystal_new,CRYSTAL_NEW) -#define fcrystal_done FORTRAN_NAME(crystal_done,CRYSTAL_DONE) -#define fcrystal_transfer FORTRAN_NAME(crystal_transfer,CRYSTAL_TRANSFER) - -#ifdef MPI - static crystal_data **handle=0; - static int n=0, max=0; -#else - typedef int MPI_Fint; -#endif - -void fcrystal_new(sint *h, const MPI_Fint *comm, const sint *np) -{ -#ifdef MPI - MPI_Comm local_com; - if(n==max) max+=max/2+1,handle=trealloc(crystal_data*,handle,max); - handle[n] = tmalloc(crystal_data,1); - - MPI_Comm_dup(MPI_Comm_f2c(*comm),&local_com); - - crystal_init(handle[n],local_com); - if(*np!=(sint)handle[n]->num) - fail("crystal_new: passed P=%d, but MPI_Comm_size gives P=%d\n", - *np,handle[n]->num); - *h=n++; -#else - if(*np!=1) - fail("crystal_new: passed P=%d, but not compiled with -DMPI\n",*np); - *h=-1; -#endif -} - -#ifdef MPI -crystal_data *fcrystal_handle(sint h) -{ - if(h<0 || h>=n || handle[h]==0) failwith("invalid crystal router handle"); - return handle[h]; -} -#endif - -void fcrystal_done(sint *h) -{ -#ifdef MPI - crystal_data *p = fcrystal_handle(*h); - handle[*h]=0; - MPI_Comm_free(&p->comm); - crystal_free(p); - free(p); -#endif -} - -void fcrystal_transfer(const sint *h, sint *n, const sint 
*max, - sint vi[], const sint *mi, - slong vl[], const sint *ml, - real vr[], const sint *mr, - const sint *pp) -{ -#ifdef MPI - crystal_data *crystal = fcrystal_handle(*h); - tuple_list tl = { *mi, *ml, *mr, *n, *max, vi, vl, vr }; - const sint p = *pp-1; /* switch to 0-based index */ - transfer(0,&tl,p,crystal); - *n = tl.n; -#endif -} - diff --git a/3rdParty/gslib.github/src/fcrystal3.c b/3rdParty/gslib.github/src/fcrystal3.c deleted file mode 100644 index 1083eefca..000000000 --- a/3rdParty/gslib.github/src/fcrystal3.c +++ /dev/null @@ -1,111 +0,0 @@ -/*------------------------------------------------------------------------------ - - FORTRAN interface for crystal router - - integer h, np - MPI_Comm comm - call crystal_new(h,comm,np) ! set h to handle to new instance - ! it is a runtime error if MPI_Comm_size gives a value different than np - call crystal_done(h) ! release instance - - integer*? vi(mi,max) ! these integer and real types - integer*? vl(ml,max) ! better match up with what is - real vr(mr,max) ! in "types.h" - call crystal_transfer(h,n,max,vi,mi,vl,ml,vr,mr,p) - - - this treats { vi(:,i), vl(:,i), vr(:,i) } , i in [1 ... 
n] - as a list of n tuples with mi integers and md reals each - - the parameter p indicates that the tuple - { vi(:,i), vl(:,i), vr(:,i) } should be sent to proc vi(p,i), - and that on return, vi(p,j) will be the source proc of tuple j - - n will be set to the number of tuples that came in - - if more tuples come in than max, n will be set to max+1, - although only max tuples were stored (the rest are lost) - - ----------------------------------------------------------------------------*/ - -#include -#include -#include -#ifdef MPI -# include -#endif - -#include "fname.h" -#include "errmem.h" -#include "types.h" -#ifdef MPI -# include "crystal.h" -# include "tuple_list.h" -# include "transfer.h" -#else - typedef void MPI_Comm; -#endif - -#define fcrystal_new FORTRAN_NAME(crystal_new,CRYSTAL_NEW) -#define fcrystal_done FORTRAN_NAME(crystal_done,CRYSTAL_DONE) -#define fcrystal_transfer FORTRAN_NAME(crystal_transfer,CRYSTAL_TRANSFER) - -#ifdef MPI - static crystal_data **handle=0; - static int n=0, max=0; -#else - typedef int MPI_Fint; -#endif - -void fcrystal_new(sint *h, const MPI_Fint *comm, const sint *np) -{ -#ifdef MPI - MPI_Comm local_com; - if(n==max) max+=max/2+1,handle=trealloc(crystal_data*,handle,max); - handle[n] = tmalloc(crystal_data,1); - - MPI_Comm_dup(MPI_Comm_f2c(*comm),&local_com); - - crystal_init(handle[n],local_com); - if(*np!=(sint)handle[n]->num) - fail("crystal_new: passed P=%d, but MPI_Comm_size gives P=%d\n", - *np,handle[n]->num); - *h=n++; -#else - if(*np!=1) - fail("crystal_new: passed P=%d, but not compiled with -DMPI\n",*np); - *h=-1; -#endif -} - -#ifdef MPI -crystal_data *fcrystal_handle(sint h) -{ - if(h<0 || h>=n || handle[h]==0) failwith("invalid crystal router handle"); - return handle[h]; -} -#endif - -void fcrystal_done(sint *h) -{ -#ifdef MPI - crystal_data *p = fcrystal_handle(*h); - handle[*h]=0; - MPI_Comm_free(&p->comm); - crystal_free(p); - free(p); -#endif -} - -/* real or float or double vr[] ??? 
FIXEME misun 8/8/2014*/ -void fcrystal_transfer(const sint *h, sint *n, const sint *max, - sint vi[], const sint *mi, - slong vl[], const sint *ml, - float vr[], const sint *mr, - const sint *pp) -{ -#ifdef MPI - crystal_data *crystal = fcrystal_handle(*h); - tuple_list tl = { *mi, *ml, *mr, *n, *max, vi, vl, vr }; - const sint p = *pp-1; /* switch to 0-based index */ - transfer(0,&tl,p,crystal); - *n = tl.n; -#endif -} - diff --git a/3rdParty/gslib.github/src/findpt.c b/3rdParty/gslib.github/src/findpt.c deleted file mode 100644 index 00cd73441..000000000 --- a/3rdParty/gslib.github/src/findpt.c +++ /dev/null @@ -1,2245 +0,0 @@ -#include -#include -#include -#include /* for cos, fabs */ -#include -#include /* for memcpy */ - -#include "errmem.h" -#include "types.h" -#include "minmax.h" -#include "poly.h" -#include "tensor.h" - -/*-------------------------------------------------------------------------- - Lobatto Polynomial Bounds - - Needed inputs are the Gauss-Lobatto quadrature nodes and weights: - unsigned nr = ..., ns = ...; - real zr[nr], wr[nr]; - real zs[ns], ws[ns]; - - lobatto_nodes(zr,nr); lobatto_weights(zr,wr,nr); - lobatto_nodes(zs,ns); lobatto_weights(zs,ws,ns); - - The number of points in the constructed piecewise (bi-)linear bounds - is a parameter; more points give tighter bounds - - unsigned mr = 2*nr, ms = 2*ns; - - The necessary setup is accomplished via: - lob_bnd_base b_data_r; - lob_bnd_ext e_data_s; - - lob_bnd_base_alloc(&b_data_r,nr,mr); - lob_bnd_base_setup(&b_data_r,zr,wr); - lob_bnd_ext_alloc(&e_data_s,ns,ms); - lob_bnd_ext_setup(&e_data_s,zs,ws); - - Bounds may then be computed via: - real work1r[2*mr], work1s[2*ms], work2[2*mr + 2*mr*ns + 2*mr*ms]; - real ur[nr], us[ns]; // 1-d polynomials on the zr[] and zs[] nodes - real u[ns][nr]; // 2-d polynomial on zr[] (x) zs[] - real bound[2]; // = { min, max } (to be computed) - - lob_bnd_1(&b_data_r ,ur,bound,work1r); // compute bounds on ur - lob_bnd_1(&e_data_s.b,us,bound,work1s); // 
compute bounds on us - lob_bnd_2(&b_data_r, &e_data_s, - (const double*)&u[0][0],bound,work2); // compute bounds on u - The above routines access the zr,zs arrays passed to *_setup - (so do not delete them between calls) - - Memory allocated in *_setup is freed with - lob_bnd_base_free(&b_data_r); - lob_bnd_ext_free(&e_data_s); - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned n; /* number of Lobatto nodes in input */ - unsigned m; /* number of Chebyshev nodes used to calculate bounds */ - real *Q0, *Q1; /* Q0[n], Q1[n] -- first two rows of change of basis matrix - from Lobatto node Lagrangian to Legendre */ - const real *z; /* z[n] -- external; Lobatto nodes */ - real *h; /* h[m] -- Chebyshev nodes */ - real *uv, *ov; /* uv[n][m], ov[n][m] -- - uv[j][:] is a piecewise linear function in the nodal - basis with nodes h[m] that is everywhere less - than or equal to the jth Lagrangian basis - function (with the Lobatto nodes z[n]) - ov[j][:] is everywhere greater than or equal */ -} lob_bnd_base; - -typedef struct { - lob_bnd_base b; - real *uvp, *uvn, *ovp, *ovn; /* [n][m] -- uv and ov split into - positive and negative parts */ -} lob_bnd_ext; - -static void lob_bnd_base_alloc(lob_bnd_base *p, unsigned n, unsigned m) -{ - p->n = n, p->m = m; - p->Q0 = tmalloc(real,2*n+m+2*n*m); - p->Q1 = p->Q0+n; - p->h = p->Q1+n; - p->uv = p->h +m; - p->ov = p->uv+n*m; -} - -static void lob_bnd_base_free(lob_bnd_base *p) -{ - free(p->Q0); -} - -static void lob_bnd_ext_alloc(lob_bnd_ext *p, unsigned n, unsigned m) -{ - p->b.n = n, p->b.m = m; - p->b.Q0 = tmalloc(real,2*n+m+6*n*m); - p->b.Q1 = p->b.Q0+n; - p->b.h = p->b.Q1+n; - p->b.uv = p->b.h +m; - p->b.ov = p->b.uv+n*m; - p->uvp = p->b.ov+n*m; - p->uvn = p->uvp +n*m; - p->ovp = p->uvn +n*m; - p->ovn = p->ovp +n*m; -} - -static void lob_bnd_ext_free(lob_bnd_ext *p) -{ - free(p->b.Q0); -} - -static void lob_bnd_base_setup(lob_bnd_base *p, const real *z, const real *w) -{ 
- unsigned i,j,m=p->m,n=p->n,mm=2*m-1; - real *q = tmalloc(real,(2*n+1)*mm+6*n), - *J = q+mm, *D = J+n*mm, *work = D+n*mm; - p->z = z; - for(i=0;iQ0[i]=w[i]/2, p->Q1[i] = 3*p->Q0[i]*z[i]; - p->h[0] = -1, p->h[m-1] = 1; - for(j=1;jh[j] = cosr((m-j-1)*PI/(m-1)); - for(j=0;jh[j], q[2*j+1] = (p->h[j]+p->h[j+1])/2; - q[mm-1] = p->h[m-1]; - lagrange_weights_deriv(z,n,q,mm,J,D,work); - for(i=0;iuv+i*m, *ov = p->ov+i*m; - ov[0] = uv[0] = J[i]; - ov[m-1] = uv[m-1] = J[(mm-1)*n+i]; - for(j=1;jb.m * p->b.n; - lob_bnd_base_setup(&p->b,z,w); - for(i=0;ib.uv[i], ovi = p->b.ov[i]; - p->uvp[i] = p->uvn[i] = p->ovp[i] = p->ovn[i] = 0; - if(uvi > 0) p->uvp[i]=uvi; else p->uvn[i]=uvi; - if(ovi > 0) p->ovp[i]=ovi; else p->ovn[i]=ovi; - } -} - -static void lob_bnd_lines(const lob_bnd_base *p, const real *u, - real *a, real *b) -{ - unsigned i,j; - real a0=0, a1=0; - const real *uv = p->uv, *ov = p->ov; - for(i=0;in;++i) a0 += p->Q0[i]*u[i], a1 += p->Q1[i]*u[i]; - for(j=0;jm;++j) b[j] = a[j] = a0 + a1*p->h[j]; - for(i=0;in;++i) { - real w = u[i] - (a0 + a1*p->z[i]); - if(w>=0) - for(j=0;jm;++j) a[j]+=w*(*uv++), b[j]+=w*(*ov++); - else - for(j=0;jm;++j) a[j]+=w*(*ov++), b[j]+=w*(*uv++); - } -} - -/* work holds p->m * 2 doubles */ -static void lob_bnd_1(const lob_bnd_base *p, const real *u, real bnd[2], - real *work) -{ - unsigned j; - real *a = work, *b = work+p->m; - lob_bnd_lines(p,u,a,b); - bnd[0] = a[0], bnd[1] = b[0]; - for(j=1;jm;++j) { - if(a[j]bnd[1]) bnd[1]=b[j]; - } -} - -/* work holds 2*mr + 2*mr*ns + 2*mr*ms doubles */ -static void lob_bnd_2(const lob_bnd_base *pr, const lob_bnd_ext *ps, - const real *u, real bnd[2], real *work) -{ - unsigned nr = pr->n, mr = pr->m, ns = ps->b.n, ms = ps->b.m; - real *a0 = work, *a1 = a0+mr, - *ar_= a1+mr, *ar=ar_, - *br_= ar+mr*ns, *br=br_, - *a_ = br+mr*ns, *a =a_, - *b_ = a +mr*ms, *b =b_, - *uvp,*ovp,*uvn,*ovn; - real b0,b1; - unsigned i,j,k; - for(i=0;ib.Q0[j], q1 = ps->b.Q1[j]; - lob_bnd_lines(pr,u,ar,br); - for(i=0;ib.h[k]; - } - ar = 
ar_, br = br_; - uvp=ps->uvp, ovp=ps->ovp, uvn=ps->uvn, ovn=ps->ovn; - for(j=0;jb.z[j]; - a = a_, b = b_; - for(i=0;i=0) /* 0 <= uw <= ow */ - for(k=0;kb1) b1=b_[i]; - } - bnd[0] = b0, bnd[1] = b1; -} - -/*-------------------------------------------------------------------------- - Small Matrix Inverse - --------------------------------------------------------------------------*/ - -static void mat_inv_2(const real A[4], real inv[4]) -{ - const real idet = 1/(A[0]*A[3]-A[1]*A[2]); - inv[0] = idet*A[3]; - inv[1] = -(idet*A[1]); - inv[2] = -(idet*A[2]); - inv[3] = idet*A[0]; -} - -static void mat_inv_3(const real A[9], real inv[9]) -{ - const real a = A[4]*A[8]-A[5]*A[7], - b = A[5]*A[6]-A[3]*A[8], - c = A[3]*A[7]-A[4]*A[6], - idet = 1/(A[0]*a+A[1]*b+A[2]*c); - inv[0] = idet*a; - inv[1] = idet*(A[2]*A[7]-A[1]*A[8]); - inv[2] = idet*(A[1]*A[5]-A[2]*A[4]); - inv[3] = idet*b; - inv[4] = idet*(A[0]*A[8]-A[2]*A[6]); - inv[5] = idet*(A[2]*A[3]-A[0]*A[5]); - inv[6] = idet*c; - inv[7] = idet*(A[1]*A[6]-A[0]*A[7]); - inv[8] = idet*(A[0]*A[4]-A[1]*A[3]); -} - -static void mat_app_2r(real y[2], const real A[4], const real x[2]) -{ - y[0] = A[0]*x[0] + A[1]*x[1]; - y[1] = A[2]*x[0] + A[3]*x[1]; -} - -static void mat_app_2c(real y[2], const real A[4], const real x[2]) -{ - y[0] = A[0]*x[0] + A[2]*x[1]; - y[1] = A[1]*x[0] + A[3]*x[1]; -} - -static void mat_app_3r(real y[3], const real A[9], const real x[3]) -{ - y[0] = A[0]*x[0] + A[1]*x[1] + A[2]*x[2]; - y[1] = A[3]*x[0] + A[4]*x[1] + A[5]*x[2]; - y[2] = A[6]*x[0] + A[7]*x[1] + A[8]*x[2]; -} - -static void mat_app_3c(real y[3], const real A[9], const real x[3]) -{ - y[0] = A[0]*x[0] + A[3]*x[1] + A[6]*x[2]; - y[1] = A[1]*x[0] + A[4]*x[1] + A[7]*x[2]; - y[2] = A[2]*x[0] + A[5]*x[1] + A[8]*x[2]; -} - -static void tinyla_solve_2(real x[2], const real A[4], const real b[2]) -{ - real inv[4]; - mat_inv_2(A,inv); - mat_app_2r(x,inv,b); -} - -static void tinyla_solve_3(real x[3], const real A[9], const real b[3]) -{ - real inv[9]; - 
mat_inv_3(A,inv); - mat_app_3r(x,inv,b); -} - -/* solve - A[0] x0 + A[2] x1 = b0, - A[2] x0 + A[1] x1 = b1 -*/ -static void tinyla_solve_sym_2(real *x0, real *x1, const real A[3], - real b0, real b1) -{ - const real idet = 1/(A[0]*A[1] - A[2]*A[2]); - *x0 = idet * (A[1]*b0 - A[2]*b1); - *x1 = idet * (A[0]*b1 - A[2]*b0); -} - -/*-------------------------------------------------------------------------- - Oriented Bounding Box - - Suffixes on names are _2 for 2-d and _3 for 3-d - - Needed inputs are the Gauss-Lobatto quadrature nodes and weights: - unsigned nr = ..., ns = ...; - real zr[nr], wr[nr]; - real zs[ns], ws[ns]; - - lobatto_nodes(zr,nr); lobatto_weights(zr,wr,nr); - lobatto_nodes(zs,ns); lobatto_weights(zs,ws,ns); - - The number of points in the constructed piecewise (bi-)linear bounds - for the boundaries is a parameter; more points give tighter bounds - - unsigned mr = 2*nr, ms = 2*ns; - - Bounding boxes are increased by a relative amount as a parameter - - real tol = 0.01; - - Setup is accomplished via: - - const real *z[2] = {zr,zs}, *w[2] = {wr,ws}; - const unsigned n[2] = {nr,ns}, m[2] = {mr,ms}; - obbox_data_2 *data = obbox_setup_2(z,w,n,m); - - Bounding box data may then be computed: - - obbox_2 box; // will store bounding box information - real xm[ns][nr], ym[ns][nr]; // x, y coordinates of the element nodes - - obbox_calc_2(data, tol, (const real *)&xm[0][0], - (const real *)&ym[0][0], &box); - - A point may be tested: - - const real x[2]; // point to test - real r[2]; - - if( obbox_axis_test_2(&box, x) ) - ... // x failed axis-aligned bounding box test - - if( obbox_test_2(&box, x, r) ) - ... // x failed oriented bounding box test - else - ... // r suitable as initial guess for parametric coords - - Once all bounding box information has been computed - - obbox_free_2(data); - - to free the memory allocated with obbox_setup_2. 
- - --------------------------------------------------------------------------*/ - -typedef struct { - lob_bnd_base dr, ds; - real *Jr0, *Dr0, *Js0, *Ds0, *work; -} obbox_data_2; - -typedef struct { - lob_bnd_base dr; - lob_bnd_ext ds, dt; - real *Jr0, *Dr0, *Js0, *Ds0, *Jt0, *Dt0, *work; -} obbox_data_3; - -static void obbox_data_alloc_2(obbox_data_2 *p, - const unsigned n[2], const unsigned m[2]) -{ - const unsigned max_npm = umax_2(n[0]+m[0],n[1]+m[1]); - lob_bnd_base_alloc(&p->dr, n[0], m[0]); - lob_bnd_base_alloc(&p->ds, n[1], m[1]); - p->Jr0 = tmalloc(real,2*n[0]+2*n[1]+2*max_npm); - p->Dr0 = p->Jr0 + n[0]; - p->Js0 = p->Dr0 + n[0]; - p->Ds0 = p->Js0 + n[1]; - p->work = p->Ds0 + n[1]; -} - -static void obbox_data_free_2(obbox_data_2 *p) -{ - lob_bnd_base_free(&p->dr); - lob_bnd_base_free(&p->ds); - free(p->Jr0); -} - -static void obbox_data_alloc_3(obbox_data_3 *p, - const unsigned n[3], const unsigned m[3]) -{ - const unsigned wk1 = 3*n[0]*n[1] + 2*m[0] + 2*m[0]*n[1] + 2*m[0]*m[1]; - const unsigned wk2 = 3*n[0]*n[2] + 2*m[0] + 2*m[0]*n[2] + 2*m[0]*m[2]; - const unsigned wk3 = 3*n[1]*n[2] + 2*m[1] + 2*m[1]*n[2] + 2*m[1]*m[2]; - const unsigned wk_max = umax_3(wk1,wk2,wk3); - lob_bnd_base_alloc(&p->dr, n[0], m[0]); - lob_bnd_ext_alloc(&p->ds, n[1], m[1]); - lob_bnd_ext_alloc(&p->dt, n[2], m[2]); - p->Jr0 = tmalloc(real,2*n[0]+2*n[1]+2*n[2] + wk_max); - p->Dr0 = p->Jr0 + n[0]; - p->Js0 = p->Dr0 + n[0]; - p->Ds0 = p->Js0 + n[1]; - p->Jt0 = p->Ds0 + n[1]; - p->Dt0 = p->Jt0 + n[2]; - p->work = p->Dt0 + n[2]; -} - -static void obbox_data_free_3(obbox_data_3 *p) -{ - lob_bnd_base_free(&p->dr); - lob_bnd_ext_free(&p->ds); - lob_bnd_ext_free(&p->dt); - free(p->Jr0); -} - -static obbox_data_2 *obbox_setup_2(const real *const z[2], - const real *const w[2], - const unsigned n[2], const unsigned m[2]) -{ - const real zero = 0; - real *work; - obbox_data_2 *p = tmalloc(obbox_data_2,1); - obbox_data_alloc_2(p,n,m); - lob_bnd_base_setup(&p->dr,z[0],w[0]); - 
lob_bnd_base_setup(&p->ds,z[1],w[1]); - work = tmalloc(real,6*umax_2(n[0],n[1])); - lagrange_weights_deriv(z[0],n[0],&zero,1,p->Jr0,p->Dr0,work); - lagrange_weights_deriv(z[1],n[1],&zero,1,p->Js0,p->Ds0,work); - free(work); - return p; -} - -static obbox_data_3 *obbox_setup_3(const real *const z[3], - const real *const w[3], - const unsigned n[3], const unsigned m[3]) -{ - const real zero = 0; - real *work; - obbox_data_3 *p = tmalloc(obbox_data_3,1); - obbox_data_alloc_3(p,n,m); - lob_bnd_base_setup(&p->dr,z[0],w[0]); - lob_bnd_ext_setup(&p->ds,z[1],w[1]); - lob_bnd_ext_setup(&p->dt,z[2],w[2]); - work = tmalloc(real,6*umax_3(n[0],n[1],n[2])); - lagrange_weights_deriv(z[0],n[0],&zero,1,p->Jr0,p->Dr0,work); - lagrange_weights_deriv(z[1],n[1],&zero,1,p->Js0,p->Ds0,work); - lagrange_weights_deriv(z[2],n[2],&zero,1,p->Jt0,p->Dt0,work); - free(work); - return p; -} - -static void obbox_free_2(obbox_data_2 *p) -{ - obbox_data_free_2(p); - free(p); -} - -static void obbox_free_3(obbox_data_3 *p) -{ - obbox_data_free_3(p); - free(p); -} - -typedef struct { - real x[2], A[4], axis_bnd[4]; -} obbox_2; - -typedef struct { - real x[3], A[9], axis_bnd[6]; -} obbox_3; - -static int obbox_axis_test_2(const obbox_2 *p, const real x[2]) -{ - return (x[0]axis_bnd[0] || x[0]>p->axis_bnd[1] || - x[1]axis_bnd[2] || x[1]>p->axis_bnd[3]); -} - -static int obbox_axis_test_3(const obbox_3 *p, const real x[3]) -{ - return (x[0]axis_bnd[0] || x[0]>p->axis_bnd[1] || - x[1]axis_bnd[2] || x[1]>p->axis_bnd[3] || - x[2]axis_bnd[4] || x[2]>p->axis_bnd[5]); -} - -static int obbox_test_2(const obbox_2 *p, const real x[2], real r[2]) -{ - const real xt[2] = {x[0]-p->x[0],x[1]-p->x[1]}; - r[0] = p->A[0]*xt[0] + p->A[1]*xt[1]; - if(fabsr(r[0])>1) return 1; - r[1] = p->A[2]*xt[0] + p->A[3]*xt[1]; - return fabsr(r[1])>1; -} - -static int obbox_test_3(const obbox_3 *p, const real x[3], real r[3]) -{ - const real xt[3] = {x[0]-p->x[0],x[1]-p->x[1],x[2]-p->x[2]}; - r[0] = p->A[0]*xt[0] + p->A[1]*xt[1] + 
p->A[2]*xt[2]; - if(fabsr(r[0])>1) return 1; - r[1] = p->A[3]*xt[0] + p->A[4]*xt[1] + p->A[5]*xt[2]; - if(fabsr(r[1])>1) return 1; - r[2] = p->A[6]*xt[0] + p->A[7]*xt[1] + p->A[8]*xt[2]; - return fabsr(r[2])>1; -} - -static void obbox_calc_tfm_2(const real *x, const real *y, - unsigned n, unsigned s, - const real c0[2], const real A[4], real *u) -{ - unsigned i; - real *v = u+n; - for(i=0; ib[1]) b[1]=ob[1]; - if(ob[2]b[3]) b[3]=ob[3]; -} - -static void obbox_merge_3(real *b, const real *ob) -{ - if(ob[0]b[1]) b[1]=ob[1]; - if(ob[2]b[3]) b[3]=ob[3]; - if(ob[4]b[5]) b[5]=ob[5]; -} - -/* work holds 2*n + 2*m reals */ -static void obbox_side_2(const real *x, const real *y, - unsigned n, unsigned s, - const real c0[2], const real A[4], real *work, - const lob_bnd_base *lbd, real bnd[4]) -{ - obbox_calc_tfm_2(x,y,n,s,c0,A,work); - lob_bnd_1(lbd,work ,bnd ,work+2*n); - lob_bnd_1(lbd,work+n,bnd+2,work+2*n); -} - -/* work holds 3*nr*ns + 2*mr + 2*mr*ns + 2*mr*ms reals */ -static void obbox_side_3(const real *x, const real *y, const real *z, - unsigned nr, unsigned sr, unsigned ns, unsigned ss, - const real c0[3], const real A[9], real *work, - const lob_bnd_base *dr, const lob_bnd_ext *ds, - real bnd[6]) -{ - obbox_calc_tfm_3(x,y,z,nr,sr,ns,ss,c0,A,work); - lob_bnd_2(dr,ds,work ,bnd ,work+3*nr*ns); - lob_bnd_2(dr,ds,work+ nr*ns,bnd+2,work+3*nr*ns); - lob_bnd_2(dr,ds,work+2*nr*ns,bnd+4,work+3*nr*ns); -} - -/* return bounds on u = A (x - c0) - bnd[0] <= u_0 <= bnd[1] - bnd[2] <= u_1 <= bnd[3] */ -static void obbox_bnd_2(const obbox_data_2 *p, - const real *x, const real *y, - const real c0[2], const real A[4], - real bnd[4]) -{ - unsigned i, nr = p->dr.n, ns = p->ds.n; - real obnd[4]; - - i = nr*(ns-1); - obbox_side_2(x ,y , nr, 1, c0,A,p->work, &p->dr, bnd); - obbox_side_2(x+i,y+i, nr, 1, c0,A,p->work, &p->dr, obnd); - obbox_merge_2(bnd,obnd); - - i = nr-1; - obbox_side_2(x ,y , ns,nr, c0,A,p->work, &p->ds, obnd); - obbox_merge_2(bnd,obnd); - obbox_side_2(x+i,y+i, nr,nr, 
c0,A,p->work, &p->ds, obnd); - obbox_merge_2(bnd,obnd); -} - -/* return bounds on u = A (x - c0) - bnd[0] <= u_0 <= bnd[1] - bnd[2] <= u_1 <= bnd[3] - bnd[4] <= u_2 <= bnd[5] */ -static void obbox_bnd_3(const obbox_data_3 *p, - const real *x, const real *y, const real *z, - const real c0[3], const real A[9], - real bnd[6]) -{ - unsigned i, nr = p->dr.n, ns = p->ds.b.n, nt = p->dt.b.n; - real obnd[6]; - - i = nr*ns*(nt-1); - obbox_side_3(x ,y ,z , nr, 1,ns,0, c0,A,p->work, &p->dr ,&p->ds, bnd); - obbox_side_3(x+i,y+i,z+i, nr, 1,ns,0, c0,A,p->work, &p->dr ,&p->ds, obnd); - obbox_merge_3(bnd,obnd); - - i = nr*(ns-1); - obbox_side_3(x ,y ,z , nr, 1,nt,i, c0,A,p->work, &p->dr ,&p->dt, obnd); - obbox_merge_3(bnd,obnd); - obbox_side_3(x+i,y+i,z+i, nr, 1,nt,i, c0,A,p->work, &p->dr ,&p->dt, obnd); - obbox_merge_3(bnd,obnd); - - i = nr-1; - obbox_side_3(x ,y ,z , ns,nr,nt,0, c0,A,p->work, &p->ds.b,&p->dt, obnd); - obbox_merge_3(bnd,obnd); - obbox_side_3(x+i,y+i,z+i, ns,nr,nt,0, c0,A,p->work, &p->ds.b,&p->dt, obnd); - obbox_merge_3(bnd,obnd); -} - -static void obbox_calc_2(const obbox_data_2 *p, real tol, - const real *x, const real *y, obbox_2 *b) -{ - const real zero[2] = {0,0}, id[4] = {1,0,0,1}; - real c0[2], jac[4], inv[4], bnd[4], u0[2], d[2]; - - obbox_bnd_2(p,x,y,zero,id,b->axis_bnd); - d[0] = b->axis_bnd[1]-b->axis_bnd[0]; - d[1] = b->axis_bnd[3]-b->axis_bnd[2]; - b->axis_bnd[0] -= tol*d[0], b->axis_bnd[1] += tol*d[0]; - b->axis_bnd[2] -= tol*d[1], b->axis_bnd[3] += tol*d[1]; - - c0[0] = tensor_ig2(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.n, - x, jac , p->work); - c0[1] = tensor_ig2(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.n, - y, jac+2, p->work); - mat_inv_2(jac,inv); - - obbox_bnd_2(p,x,y,c0,inv,bnd); - - u0[0] = (bnd[0]+bnd[1])/2; - u0[1] = (bnd[2]+bnd[3])/2; - d[0] = 2/((1+tol)*(bnd[1]-bnd[0])); - d[1] = 2/((1+tol)*(bnd[3]-bnd[2])); - b->x[0] = c0[0] + jac[0]*u0[0] + jac[1]*u0[1]; - b->x[1] = c0[1] + jac[2]*u0[0] + jac[3]*u0[1]; - b->A[0] = d[0]*inv[0], 
b->A[1] = d[0]*inv[1]; - b->A[2] = d[1]*inv[2], b->A[3] = d[1]*inv[3]; -} - -static void obbox_calc_3(const obbox_data_3 *p, real tol, - const real *x, const real *y, const real *z, - obbox_3 *b) -{ - const real zero[3] = {0,0}, id[9] = {1,0,0,0,1,0,0,0,1}; - real c0[3], jac[9], inv[9], bnd[6], u0[3], d[3]; - - obbox_bnd_3(p,x,y,z,zero,id,b->axis_bnd); - d[0] = b->axis_bnd[1]-b->axis_bnd[0]; - d[1] = b->axis_bnd[3]-b->axis_bnd[2]; - d[2] = b->axis_bnd[5]-b->axis_bnd[4]; - b->axis_bnd[0] -= tol*d[0], b->axis_bnd[1] += tol*d[0]; - b->axis_bnd[2] -= tol*d[1], b->axis_bnd[3] += tol*d[1]; - b->axis_bnd[4] -= tol*d[2], b->axis_bnd[5] += tol*d[2]; - - c0[0] = tensor_ig3(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.b.n, - p->Jt0,p->Dt0,p->dt.b.n, - x, jac , p->work); - c0[1] = tensor_ig3(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.b.n, - p->Jt0,p->Dt0,p->dt.b.n, - y, jac+3, p->work); - c0[2] = tensor_ig3(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.b.n, - p->Jt0,p->Dt0,p->dt.b.n, - z, jac+6, p->work); - mat_inv_3(jac,inv); - - obbox_bnd_3(p,x,y,z,c0,inv,bnd); - - u0[0] = (bnd[0]+bnd[1])/2; - u0[1] = (bnd[2]+bnd[3])/2; - u0[2] = (bnd[4]+bnd[5])/2; - d[0] = 2/((1+tol)*(bnd[1]-bnd[0])); - d[1] = 2/((1+tol)*(bnd[3]-bnd[2])); - d[2] = 2/((1+tol)*(bnd[5]-bnd[4])); - b->x[0] = c0[0] + jac[0]*u0[0] + jac[1]*u0[1] + jac[2]*u0[2]; - b->x[1] = c0[1] + jac[3]*u0[0] + jac[4]*u0[1] + jac[5]*u0[2]; - b->x[2] = c0[2] + jac[6]*u0[0] + jac[7]*u0[1] + jac[8]*u0[2]; - b->A[0] = d[0]*inv[0], b->A[1] = d[0]*inv[1], b->A[2] = d[0]*inv[2]; - b->A[3] = d[1]*inv[3], b->A[4] = d[1]*inv[4], b->A[5] = d[1]*inv[5]; - b->A[6] = d[2]*inv[6], b->A[7] = d[2]*inv[7], b->A[8] = d[2]*inv[8]; -} - -/*-------------------------------------------------------------------------- - Point to Possible Elements Hashing - - Initializing the data: - unsigned nel; // number of elements - const unsigned n[3]; // number of nodes in r, s, t directions - const real *xm[3]; // n[0]*n[1]*n[2]*nel x,y,z coordinates - real tol = 
0.01; // how far point is allowed to be outside element - // relative to element size - unsigned max_size = n[0]*n[1]*n[2]*nel; // maximum size of hash table - - hash_data_3 data; - hash_build_3(&data, xm, n, nel, max_size, tol); - - Using the data: - real x[3]; // point to find - - unsigned index = hash_index_3(&data, x); - unsigned i, b = data.offset[index], e = data.offset[index+1]; - - // point may be in elements - // data.offset[b], data.offset[b+1], ... , data.offset[e-1] - // - // list has maximum size data.max (e.g., e-b <= data.max) - - for(i=b; i!=e; ++i) { - unsigned el = data.offset[i]; - const obbox_3 *obb = &data.obb[el]; // bounding box data for element el - ... - } - - When done: - hash_free_3(&data); - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned hash_n; - real bnd[4]; /* bounds for all elements */ - real fac[2]; /* fac[i] = hash_n / (bnd[2*i+1]-bnd[2*i]) */ - obbox_2 *obb; /* obb[nel] -- bounding box info for each element */ - uint *offset; /* hash table -- for cell i,j: - uint index = j*hash_n+i, - b = offset[index ], - e = offset[index+1]; - elements in cell are - offset[b], offset[b+1], ..., offset[e-1] */ - unsigned max; /* maximum # of elements in any cell */ -} hash_data_2; - -typedef struct { - unsigned hash_n; - real bnd[6]; /* bounds for all elements */ - real fac[3]; /* fac[i] = hash_n / (bnd[2*i+1]-bnd[2*i]) */ - obbox_3 *obb; /* obb[nel] -- bounding box info for each element */ - uint *offset; /* hash table -- for cell i,j,k: - uint index = (k*hash_n+j)*hash_n+i, - b = offset[index ], - e = offset[index+1]; - elements in cell are - offset[b], offset[b+1], ..., offset[e-1] */ - unsigned max; /* maximum # of elements in any cell */ -} hash_data_3; - -static int ifloor(real x) -{ - /* - int y = x; - return (double)y > x ? y-1 : y; - */ - return floorr(x); -} - -static int iceil(real x) -{ - /* - int y = x; - return (double)y < x ? 
y+1 : y; - */ - return ceilr(x); -} - -static unsigned hash_index_helper(real low, real fac, unsigned n, real x) -{ - const int i = ifloor((x-low)*fac); - if(i<0) return 0; - return umin_2(i,n-1); -} - -static uint hash_index_2(const hash_data_2 *p, const real x[2]) -{ - const unsigned n = p->hash_n; - return (uint)hash_index_helper(p->bnd[2],p->fac[1],n,x[1])*n - +hash_index_helper(p->bnd[0],p->fac[0],n,x[0]); -} - -static uint hash_index_3(const hash_data_3 *p, const real x[3]) -{ - const unsigned n = p->hash_n; - return ( (uint)hash_index_helper(p->bnd[4],p->fac[2],n,x[2]) *n - +hash_index_helper(p->bnd[2],p->fac[1],n,x[1]) )*n - +hash_index_helper(p->bnd[0],p->fac[0],n,x[0]); -} - -static void hash_setfac_2(hash_data_2 *p, unsigned n) -{ - p->hash_n = n; - p->fac[0] = n/(p->bnd[1] - p->bnd[0]); - p->fac[1] = n/(p->bnd[3] - p->bnd[2]); -} - -static void hash_setfac_3(hash_data_3 *p, unsigned n) -{ - p->hash_n = n; - p->fac[0] = n/(p->bnd[1] - p->bnd[0]); - p->fac[1] = n/(p->bnd[3] - p->bnd[2]); - p->fac[2] = n/(p->bnd[5] - p->bnd[4]); -} - -static void hash_range_2(const hash_data_2 *p, uint i, unsigned d, - unsigned *ia, unsigned *ib) -{ - const real a = p->obb[i].axis_bnd[d*2 ]; - const real b = p->obb[i].axis_bnd[d*2+1]; - const int i0 = ifloor( (a - p->bnd[d*2]) * p->fac[d] ); - const unsigned i1 = iceil( (b - p->bnd[d*2]) * p->fac[d] ); - *ia = imax_2(0,i0); - *ib = imin_2(i1,p->hash_n); - if(*ib == *ia) ++(*ib); -} - -static void hash_range_3(const hash_data_3 *p, uint i, unsigned d, - unsigned *ia, unsigned *ib) -{ - const real a = p->obb[i].axis_bnd[d*2 ]; - const real b = p->obb[i].axis_bnd[d*2+1]; - const int i0 = ifloor( (a - p->bnd[d*2]) * p->fac[d] ); - const unsigned i1 = iceil( (b - p->bnd[d*2]) * p->fac[d] ); - *ia = imax_2(0,i0); - *ib = imin_2(i1,p->hash_n); - if(*ib == *ia) ++(*ib); -} - -static uint hash_count_2(hash_data_2 *p, uint nel, unsigned n) -{ - uint i,count=0; - hash_setfac_2(p,n); - for(i=0;i1) { - unsigned nm = nl+(nu-nl)/2; - 
uint size = (uint)nm*nm+1+hash_count_2(p,nel,nm); - if(size<=max_size) nl=nm,size_low=size; else nu=nm; - } - hash_setfac_2(p,nl); - return size_low; -} - -static uint hash_opt_size_3(hash_data_3 *p, uint nel, uint max_size) -{ - unsigned nl=1, nu = ceil(pow(max_size-nel,1.0/3)); - uint size_low = 2+nel; - while(nu-nl>1) { - unsigned nm = nl+(nu-nl)/2; - uint size = (uint)nm*nm*nm+1+hash_count_3(p,nel,nm); - if(size<=max_size) nl=nm,size_low=size; else nu=nm; - } - hash_setfac_3(p,nl); - return size_low; -} - -static void hash_getbb_2(hash_data_2 *p, const real *const elx[2], - const unsigned n[2], uint nel, real tol) -{ - obbox_data_2 *data; - const real *x[2]={elx[0],elx[1]}; - real *z[2], *w[2]; - uint i; unsigned d; - const unsigned nn = n[0]*n[1], m[2] = {2*n[0],2*n[1]}; - - z[0] = tmalloc(real,2*(n[0]+n[1])); - w[0] = z[0] + n[0]; - z[1] = w[0] + n[0], w[1] = z[1] + n[1]; - for(d=0;d<2;++d) - lobatto_nodes(z[d],n[d]), lobatto_weights(z[d],w[d],n[d]); - data = obbox_setup_2((const real *const*)z,(const real *const*)w,n,m); - obbox_calc_2(data,tol,x[0],x[1],&p->obb[0]); - memcpy(&p->bnd[0],(const real*)&p->obb[0].axis_bnd[0],4*sizeof(real)); - for(i=0;iobb[i]); - obbox_merge_2(&p->bnd[0],(const real*)&p->obb[i].axis_bnd[0]); - } - obbox_free_2(data); - free(z[0]); -} - -static void hash_getbb_3(hash_data_3 *p, const real *const elx[3], - const unsigned n[3], uint nel, real tol) -{ - obbox_data_3 *data; - const real *x[3]={elx[0],elx[1],elx[2]}; - real *z[3], *w[3]; - uint i; unsigned d; - const unsigned nn = n[0]*n[1]*n[2], m[3] = {2*n[0],2*n[1],2*n[2]}; - - z[0] = tmalloc(real,2*(n[0]+n[1]+n[2])); - w[0] = z[0] + n[0]; - for(d=1;d<3;++d) z[d]=w[d-1]+n[d-1], w[d]=z[d]+n[d]; - for(d=0;d<3;++d) - lobatto_nodes(z[d],n[d]), lobatto_weights(z[d],w[d],n[d]); - data = obbox_setup_3((const real *const*)z,(const real *const*)w,n,m); - obbox_calc_3(data,tol,x[0],x[1],x[2],&p->obb[0]); - memcpy(&p->bnd[0],(const real*)&p->obb[0].axis_bnd[0],6*sizeof(real)); - 
for(i=0;iobb[i]); - obbox_merge_3(&p->bnd[0],(const real*)&p->obb[i].axis_bnd[0]); - } - obbox_free_3(data); - free(z[0]); -} - -static void hash_build_2(hash_data_2 *p, const real *const x[2], - const unsigned n[2], uint nel, - uint max_hash_size, real tol) -{ - uint i,el,size,hn2,sum; unsigned hn; - unsigned *count; - p->obb = tmalloc(obbox_2,nel); - hash_getbb_2(p,x,n,nel,tol); - size = hash_opt_size_2(p,nel,max_hash_size); - p->offset = tmalloc(uint,size); - hn = p->hash_n; - hn2 = (uint)hn*hn; - count = tcalloc(unsigned,hn2); - for(el=0;elmax=count[0]; - p->offset[0]=sum; - for(i=0;ip->max) p->max=count[i]; - sum+=count[i]; - p->offset[i+1]=sum; - } - for(el=0;eloffset[p->offset[index+1] - count[index]] = el; - --count[index]; - } - } - free(count); -} - -static void hash_build_3(hash_data_3 *p, const real *const x[3], - const unsigned n[3], uint nel, - uint max_hash_size, real tol) -{ - uint i,el,size,hn3,sum; unsigned hn; - unsigned *count; - p->obb = tmalloc(obbox_3,nel); - hash_getbb_3(p,x,n,nel,tol); - size = hash_opt_size_3(p,nel,max_hash_size); - p->offset = tmalloc(uint,size); - hn = p->hash_n; - hn3 = (uint)hn*hn*hn; - count = tcalloc(unsigned,hn3); - for(el=0;elmax=count[0]; - p->offset[0]=sum; - for(i=0;ip->max) p->max=count[i]; - sum+=count[i]; - p->offset[i+1]=sum; - } - for(el=0;eloffset[p->offset[index+1] - count[index]] = el; - --count[index]; - } - } - free(count); -} - -static void hash_free_2(hash_data_2 *p) -{ - free(p->obb); - free(p->offset); -} - -static void hash_free_3(hash_data_3 *p) -{ - free(p->obb); - free(p->offset); -} - -/*-------------------------------------------------------------------------- - Optimization algorithm to find a point within an element - - Given x(r) (as values of x,y,z at all Lobatto nodes) and x_star, - find the r that minimizes || x_star - x(r) ||_2 - - As a minimization problem, the Newton step is - - __ 3 - [ J^T J - >_ d=1 resid_d H_d ] dr = J^t resid - - where resid = x_star - x(r), J = [ dx_i/dr_j ], - 
and H_d = [ d^2 x_d/dr_i dr_j ]. - - This is the appropriate step to take whenever constraints are active, - and the current iterate is on a boundary of the element. When the current - iterate is inside, J is square ( dim r = dim x ), resid will become small, - and the step - - J dr = resid - - may be used instead, still giving quadratic convergence. - - - Names use a _3 suffix for 3-d and _2 for 2-d. - The routines require an initialized lagrange_data array as input: - unsigned d, n[3] = { ... }; - real *z[3] = { tmalloc(real, n[0]), ... }; - for(d=0;d<3;++d) lobatto_nodes(z[d],n[d]); - - lagrange_data ld[3]; - for(d=0;d<3;++d) lagrange_setup(&ld[d],z[d],n[d]); - - Initialization: - opt_data_3 data; - opt_alloc_3(&data, ld); - - Use: - const real *xm[3]; // 3 pointers, each to n[0]*n[1]*n[2] reals - // giving the nodal x, y, or z coordinates - - const real x_star[3] = { ... }; // point to find - real r[3] = { 0,0,0 }; // initial guess with - unsigned c = opt_no_constraints_3; // these constraints active - - real dist = opt_findpt_3(&data,xm,x_star,r,&c); - // minimizer is r with constraints c; 2-norm of resid is dist - - Clean-up: - opt_free_3(&data); - - for(d=0;d<3;++d) lagrange_free(&ld[d]); - for(d=0;d<3;++d) free(z[d]); - - The constraint number works as follows. Let cr be the constraints - on the r variable: - cr = 0 r fixed at -1 - cr = 1 r not fixed - cr = 2 r fixed at 1 - Then the constraint number is (ct*3+cs)*3+cr - - --------------------------------------------------------------------------*/ - -static const unsigned opt_no_constraints_2 = 3+1; -static const unsigned opt_no_constraints_3 = 9+3+1; - -/* how many directions are constrained? */ -static const char opt_constr_num_2[9] = {2,1,2, 1,0,1, 2,1,2}; -static const char opt_constr_num_3[27] = { - 3,2,3, 2,1,2, 3,2,3, - 2,1,2, 1,0,1, 2,1,2, - 3,2,3, 2,1,2, 3,2,3 -}; - -/* which direction is constrained? 
*/ -static const char opt_constr_dir_2[9] = {-1, 1,-1, 0,-1, 0, -1, 1,-1}; -static const char opt_constr_dir_3[27] = { - -1,-1,-1, -1, 2,-1, -1,-1,-1, - -1, 1,-1, 0,-1, 0, -1, 1,-1, - -1,-1,-1, -1, 2,-1, -1,-1,-1 -}; - -/* which direction is not constrained? */ -static const char opt_constr_not[27] = { - -1, 0,-1, 1,-1, 1, -1, 0,-1, - 2,-1, 2, -1,-1,-1, 2,-1, 2, - -1, 0,-1, 1,-1, 1, -1, 0,-1 -}; - -static const char opt_constr_wide[27] = { - 0x00,0x01,0x02, 0x04,0x05,0x06, 0x08,0x09,0x0a, - 0x10,0x11,0x12, 0x14,0x15,0x16, 0x18,0x19,0x1a, - 0x20,0x21,0x22, 0x24,0x25,0x26, 0x28,0x29,0x2a -}; - -static const unsigned opt_other1_3[3] = {1,0,0}, - opt_other2_3[3] = {2,2,1}; - -static unsigned opt_constr(unsigned constraints, unsigned d) -{ - return (opt_constr_wide[constraints]>>(d*2))&3; -} - -static void opt_constr_unpack_2(unsigned constraints, unsigned *c) -{ - const char cw = opt_constr_wide[constraints]; - c[0] = cw & 3; - c[1] = cw >> 2; -} - -static void opt_constr_unpack_3(unsigned constraints, unsigned *c) -{ - const char cw = opt_constr_wide[constraints]; - c[0] = cw & 3; - c[1] = (cw >> 2) & 3; - c[2] = cw >> 4; -} - -static unsigned opt_constr_pack_2(const unsigned *c) -{ - return c[1]*3+c[0]; -} - -static unsigned opt_constr_pack_3(const unsigned *c) -{ - return (c[2]*3+c[1])*3+c[0]; -} - -/*-------------------------------------------------------------------------- - - 3 - D - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned constraints; - unsigned dn, d1, d2; - real *x[3], *fdn[3]; -} opt_face_data_3; - -typedef struct { - unsigned constraints; - unsigned de, d1, d2; - real *x[3], *fd1[3], *fd2[3]; -} opt_edge_data_3; - -typedef struct { - unsigned constraints; - real x[3], jac[9]; -} opt_point_data_3; - -typedef struct { - lagrange_data *ld; - unsigned size[4]; - const real *elx[3]; - opt_face_data_3 fd; - opt_edge_data_3 ed; - opt_point_data_3 pd; - real *work; - real x[3], jac[9]; -} 
opt_data_3; - -static void opt_alloc_3(opt_data_3 *p, lagrange_data *ld) -{ - const unsigned nr = ld[0].n, ns = ld[1].n, nt = ld[2].n, - nf = umax_3(nr*ns,nr*nt,ns*nt), - ne = umax_3(nr,ns,nt), - nw = 2*ns*nt + 3*ns; - p->size[0] = 1; - p->size[1] = nr; - p->size[2] = nr*ns; - p->size[3] = p->size[2]*nt; - p->ld = ld; - p->work = tmalloc(real, 6*nf + 9*ne + nw); - p->fd.x[0] = p->work + nw; - p->fd.x[1] = p->fd.x[0] + nf; - p->fd.x[2] = p->fd.x[1] + nf; - p->fd.fdn[0] = p->fd.x[2] + nf; - p->fd.fdn[1] = p->fd.fdn[0] + nf; - p->fd.fdn[2] = p->fd.fdn[1] + nf; - p->ed.x[0] = p->fd.fdn[2] + nf; - p->ed.x[1] = p->ed.x[0] + ne; - p->ed.x[2] = p->ed.x[1] + ne; - p->ed.fd1[0] = p->ed.x[2] + ne; - p->ed.fd1[1] = p->ed.fd1[0] + ne; - p->ed.fd1[2] = p->ed.fd1[1] + ne; - p->ed.fd2[0] = p->ed.fd1[2] + ne; - p->ed.fd2[1] = p->ed.fd2[0] + ne; - p->ed.fd2[2] = p->ed.fd2[1] + ne; -} - -static void opt_free_3(opt_data_3 *p) -{ - free(p->work); -} - -static void opt_vol_set_3(opt_data_3 *p, const real r[3]) -{ - lagrange_1(&p->ld[0],r[0]); - lagrange_1(&p->ld[1],r[1]); - lagrange_1(&p->ld[2],r[2]); -} - -/* work holds 2*ns*nt + 3*ns reals */ -static void opt_vol_intp_3(opt_data_3 *p) -{ - unsigned d; - const lagrange_data *ld = p->ld; - - for(d=0;d<3;++d) - p->x[d] = tensor_ig3(ld[0].J,ld[0].D,ld[0].n, - ld[1].J,ld[1].D,ld[1].n, - ld[2].J,ld[2].D,ld[2].n, - p->elx[d], &p->jac[d*3], p->work); -} - -static void opt_vol_set_intp_3(opt_data_3 *p, const real r[3]) -{ - opt_vol_set_3(p,r); - opt_vol_intp_3(p); -} - -static void opt_face_proj_3(opt_data_3 *p) -{ - unsigned d, off=0; - const unsigned dn = p->fd.dn, d1 = p->fd.d1, d2 = p->fd.d2, - so = p->size[d2]-p->size[d1+1], - s1 = p->size[d1], sn = p->size[dn], - n1 = p->ld[d1].n, n2 = p->ld[d2].n, nn = p->ld[dn].n; - const real *D = p->ld[dn].D_z0; - if(opt_constr(p->fd.constraints,dn)==2) - off = p->size[dn+1]-p->size[dn], - D = p->ld[dn].D_zn; - for(d=0;d<3;++d) { - unsigned i,j,k,index=0; - const real *in = p->elx[d]+off; - 
for(j=n2;j;--j,in+=so) - for(i=n1;i;--i,++index,in+=s1) { - const real *ind = in-off; - real *fdn = &p->fd.fdn[d][index]; - p->fd.x[d][index] = *in; - *fdn = 0; - for(k=0;kfd.constraints!=constr) { - p->fd.constraints=constr; - p->fd.dn = opt_constr_dir_3[constr]; - p->fd.d1 = opt_other1_3[p->fd.dn]; - p->fd.d2 = opt_other2_3[p->fd.dn]; - opt_face_proj_3(p); - } - lagrange_1(&p->ld[p->fd.d1],r[p->fd.d1]); - lagrange_1(&p->ld[p->fd.d2],r[p->fd.d2]); -} - -/* work holds 2*ld[d2].n reals */ -static void opt_face_intp_3(opt_data_3 *p) -{ - unsigned d; - const unsigned dn = p->fd.dn, d1 = p->fd.d1, d2 = p->fd.d2, - n1 = p->ld[d1].n, n2 = p->ld[d2].n; - const real *J1 = p->ld[d1].J, *J2 = p->ld[d2].J, - *D1 = p->ld[d1].D, *D2 = p->ld[d2].D; - - for(d=0;d<3;++d) { - real g[2]; - p->x[d] = tensor_ig2(J1,D1,n1, J2,D2,n2, p->fd.x[d], &g[0], p->work); - p->jac[d*3+d1] = g[0]; - p->jac[d*3+d2] = g[1]; - p->jac[d*3+dn] = tensor_i2(J1,n1, J2,n2, p->fd.fdn[d], p->work); - } -} - -static void opt_face_set_intp_3(opt_data_3 *p, const real r[3], unsigned constr) -{ - opt_face_set_3(p,r,constr); - opt_face_intp_3(p); -} - -static void opt_face_hess_3(opt_data_3 *p, real hess[9]) -{ - unsigned d; - const unsigned d1 = p->fd.d1, d2 = p->fd.d2, - n1 = p->ld[d1].n, n2 = p->ld[d2].n; - const real *J1 = p->ld[d1].J , *J2 = p->ld[d2].J, - *D1 = p->ld[d1].D , *D2 = p->ld[d2].D, - *S1 = p->ld[d1].D2, *S2 = p->ld[d2].D2; - - lagrange_2u(&p->ld[d1]); - lagrange_2u(&p->ld[d2]); - - for(d=0;d<3;++d) { - (void) tensor_ig2(J1,S1,n1, J2,S2,n2, p->fd.x[d], hess+d*3, p->work); - hess[d*3+0] = tensor_i2(S1,n1, J2,n2, p->fd.x[d], p->work); - hess[d*3+1] = tensor_i2(J1,n1, S2,n2, p->fd.x[d], p->work); - hess[d*3+2] = tensor_i2(D1,n1, D2,n2, p->fd.x[d], p->work); - } -} - -static void opt_edge_proj_3(opt_data_3 *p) -{ - unsigned d, off, off1=0, off2=0; - const unsigned de=p->ed.de, d1=p->ed.d1, d2=p->ed.d2, - se=p->size[de], s1=p->size[d1], s2=p->size[d2], - ne=p->ld[de].n, n1=p->ld[d1].n, n2=p->ld[d2].n; 
- const real *fD1, *fD2; - if(opt_constr(p->ed.constraints,d1)==0) - fD1=p->ld[d1].D_z0; - else - fD1=p->ld[d1].D_zn, off1 = p->size[d1+1]-p->size[d1]; - if(opt_constr(p->ed.constraints,d2)==0) - fD2=p->ld[d2].D_z0; - else - fD2=p->ld[d2].D_zn, off2 = p->size[d2+1]-p->size[d2]; - off = off1+off2; - for(d=0;d<3;++d) { - unsigned i,j; - const real *in = p->elx[d]+off; - for(i=0;ied.fd1[d][i], *fd2 = &p->ed.fd2[d][i]; - p->ed.x[d][i] = *in; - *fd1 = *fd2 = 0; - for(j=0;jed.constraints!=constr) { - p->ed.constraints=constr; - p->ed.de = opt_constr_not[constr]; - p->ed.d1 = opt_other1_3[p->ed.de]; - p->ed.d2 = opt_other2_3[p->ed.de]; - opt_edge_proj_3(p); - } - lagrange_1(&p->ld[p->ed.de],r[p->ed.de]); -} - -static void opt_edge_intp_3(opt_data_3 *p) -{ - unsigned d; - const unsigned de = p->ed.de, d1 = p->ed.d1, d2 = p->ed.d2, - n = p->ld[de].n; - const real *J = p->ld[de].J, *D = p->ld[de].D; - - for(d=0;d<3;++d) { - p->x[d] = tensor_ig1(J,D,n, p->ed.x[d], &p->jac[d*3+de]); - p->jac[d*3+d1] = tensor_i1(J,n, p->ed.fd1[d]); - p->jac[d*3+d2] = tensor_i1(J,n, p->ed.fd2[d]); - } -} - -static void opt_edge_set_intp_3(opt_data_3 *p, const real r[3], unsigned constr) -{ - opt_edge_set_3(p,r,constr); - opt_edge_intp_3(p); -} - -static void opt_edge_hess_3(opt_data_3 *p, real hess[3]) -{ - unsigned d; - const unsigned de = p->ed.de, n = p->ld[de].n; - const real *D2 = p->ld[de].D2; - lagrange_2u(&p->ld[de]); - for(d=0;d<3;++d) hess[d] = tensor_i1(D2,n, p->ed.x[d]); -} - -static void opt_point_proj_3(opt_data_3 *p) -{ - unsigned off[3], offt, d, c[3]; - const real *fD[3]; - opt_constr_unpack_3(p->pd.constraints,c); - for(d=0;d<3;++d) - if(c[d]==0) - fD[d]=p->ld[d].D_z0,off[d]=0; - else - fD[d]=p->ld[d].D_zn,off[d]=p->size[d+1]-p->size[d]; - offt = off[0]+off[1]+off[2]; - for(d=0;d<9;++d) p->pd.jac[d]=0; - for(d=0;d<3;++d) { - unsigned i,j; - p->pd.x[d] = p->elx[d][offt]; - for(i=0;i<3;++i) { - const real *in = p->elx[d]+offt-off[i]; - for(j=0;jld[i].n;++j,in+=p->size[i]) - 
p->pd.jac[d*3+i] += *in * fD[i][j]; - } - } -} - -static void opt_point_set_3(opt_data_3 *p, unsigned constr) -{ - if(p->pd.constraints!=constr) { - p->pd.constraints=constr; - opt_point_proj_3(p); - } -} - -static void opt_point_intp_3(opt_data_3 *p) -{ - memcpy(p->x,p->pd.x,3*sizeof(real)); - memcpy(p->jac,p->pd.jac,9*sizeof(real)); -} - -static void opt_point_set_intp_3(opt_data_3 *p, unsigned constr) -{ - opt_point_set_3(p,constr); - opt_point_intp_3(p); -} - -#define DIAGNOSTICS 0 - -static double opt_findpt_3(opt_data_3 *p, const real *const elx[3], - const real xstar[3], real r[3], unsigned *constr) -{ - real dr[3], resid[3], steep[3]; - - unsigned c=*constr,ac,d,cc[3],step=0; - - p->elx[0]=elx[0], p->elx[1]=elx[1], p->elx[2]=elx[2]; - - p->fd.constraints = opt_no_constraints_3; - p->ed.constraints = opt_no_constraints_3; - p->pd.constraints = opt_no_constraints_3; - -# if DIAGNOSTICS - printf("opt_findpt: xstar = %g, %g, %g\n", xstar[0], xstar[1], xstar[2]); -# endif - - do { - ++step; - if(step==50) fail("%s: opt_findpt_3 did not converge\n",__FILE__); -# if DIAGNOSTICS - printf(" iteration %u\n", step); - printf(" %d constraint(s) active\n", (int)opt_constr_num_3[c]); -# endif - /* update face/edge/point data if necessary, - and evaluate x(r) as well as the jacobian */ - switch(opt_constr_num_3[c]) { - case 0: opt_vol_set_intp_3(p,r); break; - case 1: opt_face_set_intp_3(p,r,c); break; - case 2: opt_edge_set_intp_3(p,r,c); break; - case 3: opt_point_set_intp_3(p,c); break; - } -# if DIAGNOSTICS - printf(" r = %g, %g, %g\n", r[0], r[1], r[2]); - printf(" x = %g, %g, %g\n", p->x[0], p->x[1], p->x[2]); -# endif - /* compute residual */ - for(d=0;d<3;++d) resid[d]=xstar[d]-p->x[d]; -# if DIAGNOSTICS - printf(" resid = %g, %g, %g\n", resid[0], resid[1], resid[2]); - printf(" 2-norm = %g\n", r2norm_3(resid[0],resid[1],resid[2])); -# endif - /* check constraints against steepest descent direction */ - ac = c; - if(opt_constr_num_3[c]) { - 
opt_constr_unpack_3(c,cc); - mat_app_3c(steep,p->jac,resid); /* steepest descent = J^T r */ -# if DIAGNOSTICS - printf(" steepest descent = %g, %g, %g\n", steep[0],steep[1],steep[2]); -# endif - for(d=0;d<3;++d) - if((cc[d]==0 && steep[d]>0) || (cc[d]==2 && steep[d]<0)) cc[d]=1; - ac = opt_constr_pack_3(cc); - } - /* update face/edge/point data if necessary */ - if(ac!=c) { - c=ac; -# if DIAGNOSTICS - printf(" relaxed to %d constraints\n", (int)opt_constr_num_3[c]); -# endif - switch(opt_constr_num_3[c]) { - case 1: opt_face_set_3(p,r,c); break; - case 2: opt_edge_set_3(p,r,c); break; - case 3: opt_point_set_3(p,c); break; - } - } - /* compute Newton step */ - switch(opt_constr_num_3[c]) { - case 0: tinyla_solve_3(dr,p->jac,resid); break; - case 1: { - const unsigned dn = p->fd.dn, d1 = p->fd.d1, d2 = p->fd.d2; - real A[4], H[9]; - const real *J = p->jac; - opt_face_hess_3(p,H); - A[0] = J[d1]*J[d1] + J[3+d1]*J[3+d1] + J[6+d1]*J[6+d1]; - A[1] = J[d2]*J[d2] + J[3+d2]*J[3+d2] + J[6+d2]*J[6+d2]; - A[2] = J[d1]*J[d2] + J[3+d1]*J[3+d2] + J[6+d1]*J[6+d2]; - A[0] -= resid[0]*H[0] + resid[1]*H[3] + resid[2]*H[6]; - A[1] -= resid[0]*H[1] + resid[1]*H[4] + resid[2]*H[7]; - A[2] -= resid[0]*H[2] + resid[1]*H[5] + resid[2]*H[8]; - tinyla_solve_sym_2(&dr[d1],&dr[d2],A,steep[d1],steep[d2]); - dr[dn]=0; - } break; - case 2: { - const unsigned de = p->ed.de, d1 = p->ed.d1, d2 = p->ed.d2; - real fac, H[3]; - const real *J = p->jac+de; - opt_edge_hess_3(p,H); - fac = J[0]*J[0]+J[3]*J[3]+J[6]*J[6] - -(resid[0]*H[0]+resid[1]*H[1]+resid[2]*H[2]); - dr[de] = steep[de] / fac; - dr[d1] = 0, dr[d2] = 0; - } break; - case 3: - dr[0] = dr[1] = dr[2] = 0; - break; - } -# if DIAGNOSTICS - printf(" dr = %g, %g, %g\n", dr[0], dr[1], dr[2]); -# endif - /* project new iteration onto [-1,1]^3 */ - opt_constr_unpack_3(c,cc); - for(d=0;d<3;++d) { - if(cc[d]!=1) continue; - r[d] += dr[d]; - if(r[d] <= -1) - dr[d] -= r[d]+1, r[d] = -1, cc[d]=0; - else if(r[d] >= 1) - dr[d] -= r[d]-1, r[d] = 1, cc[d]=2; 
- } - c = opt_constr_pack_3(cc); - } while(r1norm_3(dr[0],dr[1],dr[2]) > 30*EPS); - *constr = c; -# if 0 - printf("opt_findpt_3 converged in %u iterations\n", step); -# endif - return r2norm_3(resid[0],resid[1],resid[2]); -} - -#undef DIAGNOSTICS - -/*-------------------------------------------------------------------------- - - 2 - D - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned constraints; - unsigned de, d1; - real *x[2], *fd1[2]; -} opt_edge_data_2; - -typedef struct { - unsigned constraints; - real x[2], jac[4]; -} opt_point_data_2; - -typedef struct { - lagrange_data *ld; - unsigned size[3]; - const real *elx[2]; - opt_edge_data_2 ed; - opt_point_data_2 pd; - real *work; - real x[2], jac[4]; -} opt_data_2; - -static void opt_alloc_2(opt_data_2 *p, lagrange_data *ld) -{ - const unsigned nr = ld[0].n, ns = ld[1].n, - ne = umax_2(nr,ns), - nw = 2*ns; - p->size[0] = 1; - p->size[1] = nr; - p->size[2] = nr*ns; - p->ld = ld; - p->work = tmalloc(real, 4*ne + nw); - p->ed.x[0] = p->work + nw; - p->ed.x[1] = p->ed.x[0] + ne; - p->ed.fd1[0] = p->ed.x[1] + ne; - p->ed.fd1[1] = p->ed.fd1[0] + ne; -} - -static void opt_free_2(opt_data_2 *p) -{ - free(p->work); -} - -static void opt_area_set_2(opt_data_2 *p, const real r[2]) -{ - lagrange_1(&p->ld[0],r[0]); - lagrange_1(&p->ld[1],r[1]); -} - -/* work holds 2*ns reals */ -static void opt_area_intp_2(opt_data_2 *p) -{ - unsigned d; - const lagrange_data *ld = p->ld; - - for(d=0;d<2;++d) - p->x[d] = tensor_ig2(ld[0].J,ld[0].D,ld[0].n, - ld[1].J,ld[1].D,ld[1].n, - p->elx[d], &p->jac[d*2], p->work); -} - -static void opt_area_set_intp_2(opt_data_2 *p, const real r[2]) -{ - opt_area_set_2(p,r); - opt_area_intp_2(p); -} - -static void opt_edge_proj_2(opt_data_2 *p) -{ - unsigned d, off=0; - const unsigned de = p->ed.de, d1 = p->ed.d1, - se=p->size[de], s1=p->size[d1], - ne=p->ld[de].n, n1=p->ld[d1].n; - const real *fD1; - if(opt_constr(p->ed.constraints,d1)==0) - 
fD1=p->ld[d1].D_z0; - else - fD1=p->ld[d1].D_zn, off = p->size[d1+1]-p->size[d1]; - for(d=0;d<2;++d) { - unsigned i,j; - const real *in = p->elx[d]+off; - for(i=0;ied.fd1[d][i]; - p->ed.x[d][i] = *in; - *fd1 = 0; - for(j=0;jed.constraints!=constr) { - p->ed.constraints=constr; - p->ed.de = opt_constr_not[constr]; - p->ed.d1 = 1 - p->ed.de; - opt_edge_proj_2(p); - } - lagrange_1(&p->ld[p->ed.de],r[p->ed.de]); -} - -static void opt_edge_intp_2(opt_data_2 *p) -{ - unsigned d; - const unsigned de = p->ed.de, d1 = p->ed.d1, n = p->ld[de].n; - const real *J = p->ld[de].J, *D = p->ld[de].D; - for(d=0;d<2;++d) { - p->x[d] = tensor_ig1(J,D,n, p->ed.x[d], &p->jac[d*2+de]); - p->jac[d*2+d1] = tensor_i1(J,n, p->ed.fd1[d]); - } -} - -static void opt_edge_set_intp_2(opt_data_2 *p, const real r[2], unsigned constr) -{ - opt_edge_set_2(p,r,constr); - opt_edge_intp_2(p); -} - -static void opt_edge_hess_2(opt_data_2 *p, real hess[2]) -{ - unsigned d; - const unsigned de = p->ed.de, n = p->ld[de].n; - const real *D2 = p->ld[de].D2; - lagrange_2u(&p->ld[de]); - for(d=0;d<2;++d) hess[d] = tensor_i1(D2,n, p->ed.x[d]); -} - -static void opt_point_proj_2(opt_data_2 *p) -{ - unsigned off[2], offt, d, c[2]; - const real *fD[2]; - opt_constr_unpack_2(p->pd.constraints,c); - for(d=0;d<2;++d) - if(c[d]==0) - fD[d]=p->ld[d].D_z0,off[d]=0; - else - fD[d]=p->ld[d].D_zn,off[d]=p->size[d+1]-p->size[d]; - offt = off[0]+off[1]; - for(d=0;d<4;++d) p->pd.jac[d]=0; - for(d=0;d<2;++d) { - unsigned i,j; - p->pd.x[d] = p->elx[d][offt]; - for(i=0;i<2;++i) { - const real *in = p->elx[d]+offt-off[i]; - for(j=0;jld[i].n;++j,in+=p->size[i]) - p->pd.jac[d*2+i] += *in * fD[i][j]; - } - } -} - -static void opt_point_set_2(opt_data_2 *p, unsigned constr) -{ - if(p->pd.constraints!=constr) { - p->pd.constraints=constr; - opt_point_proj_2(p); - } -} - -static void opt_point_intp_2(opt_data_2 *p) -{ - memcpy(p->x,p->pd.x,2*sizeof(real)); - memcpy(p->jac,p->pd.jac,4*sizeof(real)); -} - -static void 
opt_point_set_intp_2(opt_data_2 *p, unsigned constr) -{ - opt_point_set_2(p,constr); - opt_point_intp_2(p); -} - -#define DIAGNOSTICS 0 - -static double opt_findpt_2(opt_data_2 *p, const real *const elx[2], - const real xstar[2], real r[2], unsigned *constr) -{ - real dr[2], resid[2], steep[2]; - - unsigned c=*constr,ac,d,cc[2],step=0; - - p->elx[0]=elx[0], p->elx[1]=elx[1]; - - p->ed.constraints = opt_no_constraints_2; - p->pd.constraints = opt_no_constraints_2; - -# if DIAGNOSTICS - printf("opt_findpt: xstar = %g, %g\n", xstar[0], xstar[1]); -# endif - - do { - ++step; - if(step==150) fail("%s: opt_findpt_2 did not converge\n",__FILE__); -# if DIAGNOSTICS - printf(" iteration %u\n", step); - printf(" %d constraint(s) active\n", (int)opt_constr_num_2[c]); -# endif - /* update face/edge/point data if necessary, - and evaluate x(r) as well as the jacobian */ - switch(opt_constr_num_2[c]) { - case 0: opt_area_set_intp_2(p,r); break; - case 1: opt_edge_set_intp_2(p,r,c); break; - case 2: opt_point_set_intp_2(p,c); break; - } -# if DIAGNOSTICS - printf(" r = %g, %g\n", r[0], r[1]); - printf(" x = %g, %g\n", p->x[0], p->x[1]); -# endif - /* compute residual */ - for(d=0;d<2;++d) resid[d]=xstar[d]-p->x[d]; -# if DIAGNOSTICS - printf(" resid = %g, %g\n", resid[0], resid[1]); - printf(" 2-norm = %g\n", r2norm_2(resid[0],resid[1])); -# endif - /* check constraints against steepest descent direction */ - ac = c; - if(opt_constr_num_2[c]) { - opt_constr_unpack_2(c,cc); - mat_app_2c(steep,p->jac,resid); /* steepest descent = J^T r */ -# if DIAGNOSTICS - printf(" steepest descent = %g, %g\n", steep[0], steep[1]); -# endif - for(d=0;d<2;++d) - if((cc[d]==0 && steep[d]>0) || (cc[d]==2 && steep[d]<0)) cc[d]=1; - ac = opt_constr_pack_2(cc); - } - /* update face/edge/point data if necessary */ - if(ac!=c) { - c=ac; -# if DIAGNOSTICS - printf(" relaxed to %d constraints\n", (int)opt_constr_num_2[c]); -# endif - switch(opt_constr_num_2[c]) { - case 1: opt_edge_set_2(p,r,c); break; - 
case 2: opt_point_set_2(p,c); break; - } - } - /* compute Newton step */ - switch(opt_constr_num_2[c]) { - case 0: tinyla_solve_2(dr,p->jac,resid); break; - case 1: { - const unsigned de = p->ed.de, d1 = p->ed.d1; - real fac, H[2]; - const real *J = p->jac+de; - opt_edge_hess_2(p,H); - fac = J[0]*J[0]+J[2]*J[2]-(resid[0]*H[0]+resid[1]*H[1]); - dr[de] = steep[de] / fac; - dr[d1] = 0; - } break; - case 2: - dr[0] = dr[1] = 0; - break; - } -# if DIAGNOSTICS - printf(" dr = %g, %g\n", dr[0], dr[1]); -# endif - /* project new iteration onto [-1,1]^2 */ - opt_constr_unpack_2(c,cc); - for(d=0;d<2;++d) { - if(cc[d]!=1) continue; - r[d] += dr[d]; - if(r[d] <= -1) - dr[d] -= r[d]+1, r[d] = -1, cc[d]=0; - else if(r[d] >= 1) - dr[d] -= r[d]-1, r[d] = 1, cc[d]=2; - } - c = opt_constr_pack_2(cc); - } while(r1norm_2(dr[0],dr[1]) > 30*EPS); - *constr = c; - return r2norm_2(resid[0],resid[1]); -} - -#undef DIAGNOSTICS - -/*-------------------------------------------------------------------------- - Point Finding (interface/top-level) - - Initializing the data: - unsigned nel; // number of elements - const unsigned n[3]; // number of nodes in r, s, t directions - const real *xm[3]; // n[0]*n[1]*n[2]*nel x,y,z coordinates - real tol = 0.01; // how far point is allowed to be outside element - // relative to element size - unsigned max_size = n[0]*n[1]*n[2]*nel; // maximum size of hash table - - findpt_data_3 *data = findpt_setup_3(xm,n,nel,max_size,tol); - - Using the data: - real x[3] = { ... }; // point to find - int el; // element number - real r[3]; // parametric coordinates - int guess = 0; // do we use (el,r,s,t) as an initial guess? 
- int code; // 0 : normal, -1 : outside all elements, - // 1 : border, or outside but within tolerance - real dist; // distance in xyz space from returned (el,r,s,t) to given - // (x,y,z) - - code = findpt_3(data, x, guess, &el, r, &dist); - - When done: - findpt_free_3(&data); - - --------------------------------------------------------------------------*/ - -typedef struct { - uint el; - real r[3]; - real dist; -} findpt_listel; - -/* heap sort on A[0:n-1] with key A[i]->dist - precondition: n!=0 */ -static void findpt_list_sort(findpt_listel **A, unsigned n) -{ - unsigned i; - --A; /* make A have a base index of 1 */ - /* build heap */ - for(i=2;i<=n;++i) { - findpt_listel *item = A[i]; - unsigned hole = i, parent = hole>>1; - if(A[parent]->dist >= item->dist) continue; - do { - A[hole] = A[parent]; - hole = parent; - parent>>=1; - } while(parent && A[parent]->dist < item->dist); - A[hole] = item; - } - /* extract */ - for(i=n-1;i;--i) { - findpt_listel *item = A[i+1]; - unsigned hole = 1; - A[i+1] = A[1]; - for(;;) { - unsigned ch = hole<<1, r = ch+1; - if(r<=i && A[ch]->dist < A[r]->dist) ch=r; - if(ch>i || item->dist >= A[ch]->dist) break; - A[hole]=A[ch]; - hole=ch; - } - A[hole] = item; - } -} - -typedef struct { - const real *xw[2]; /* geometry data */ - real *z[2]; /* lobatto nodes */ - lagrange_data ld[2]; /* interpolation, derivative weights & data */ - unsigned nptel; /* nodes per element */ - hash_data_2 *hash; /* geometric hashing data */ - findpt_listel *list, **sorted, **end; /* pre-allocated list of elements to - check (found by hashing), and - pre-allocated list of pointers into - the first list for sorting */ - opt_data_2 *od; /* data for the optimization algorithm */ - real *od_work; -} findpt_data_2; - -typedef struct { - const real *xw[3]; /* geometry data */ - real *z[3]; /* lobatto nodes */ - lagrange_data ld[3]; /* interpolation, derivative weights & data */ - unsigned nptel; /* nodes per element */ - hash_data_3 *hash; /* geometric 
hashing data */ - findpt_listel *list, **sorted, **end; /* pre-allocated list of elements to - check (found by hashing), and - pre-allocated list of pointers into - the first list for sorting */ - opt_data_3 *od; /* data for the optimization algorithm */ - real *od_work; -} findpt_data_3; - -findpt_data_2 *findpt_setup_2( - const real *const xw[2], const unsigned n[2], uint nel, - uint max_hash_size, real bbox_tol) -{ - unsigned d; - findpt_data_2 *p = tmalloc(findpt_data_2,1); - - p->hash = tmalloc(hash_data_2,1); - p->od = tmalloc(opt_data_2,1); - - for(d=0;d<2;++d) p->xw[d]=xw[d]; - p->nptel = n[0]*n[1]; - - hash_build_2(p->hash,xw,n,nel,max_hash_size,bbox_tol); - - for(d=0;d<2;++d) { - p->z[d] = tmalloc(real,n[d]); - lobatto_nodes(p->z[d],n[d]); - lagrange_setup(&p->ld[d],p->z[d],n[d]); - } - - p->list = tmalloc(findpt_listel , p->hash->max); - p->sorted = tmalloc(findpt_listel*, p->hash->max); - - opt_alloc_2(p->od,p->ld); - p->od_work = p->od->work; - - return p; -} - -findpt_data_3 *findpt_setup_3( - const real *const xw[3], const unsigned n[3], uint nel, - uint max_hash_size, real bbox_tol) -{ - unsigned d; - findpt_data_3 *p = tmalloc(findpt_data_3,1); - - p->hash = tmalloc(hash_data_3,1); - p->od = tmalloc(opt_data_3,1); - - for(d=0;d<3;++d) p->xw[d]=xw[d]; - p->nptel = n[0]*n[1]*n[2]; - - hash_build_3(p->hash,xw,n,nel,max_hash_size,bbox_tol); - - for(d=0;d<3;++d) { - p->z[d] = tmalloc(real,n[d]); - lobatto_nodes(p->z[d],n[d]); - lagrange_setup(&p->ld[d],p->z[d],n[d]); - } - - p->list = tmalloc(findpt_listel , p->hash->max); - p->sorted = tmalloc(findpt_listel*, p->hash->max); - - opt_alloc_3(p->od,p->ld); - p->od_work = p->od->work; - - return p; -} - -void findpt_free_2(findpt_data_2 *p) -{ - unsigned d; - opt_free_2(p->od); free(p->od); - hash_free_2(p->hash); free(p->hash); - free(p->list); - free(p->sorted); - for(d=0;d<2;++d) free(p->z[d]); - free(p); -} - -void findpt_free_3(findpt_data_3 *p) -{ - unsigned d; - opt_free_3(p->od); free(p->od); - 
hash_free_3(p->hash); free(p->hash); - free(p->list); - free(p->sorted); - for(d=0;d<3;++d) free(p->z[d]); - free(p); -} - -const real *findpt_allbnd_2(const findpt_data_2 *p) -{ - return p->hash->bnd; -} - -const real *findpt_allbnd_3(const findpt_data_3 *p) -{ - return p->hash->bnd; -} - -static void findpt_hash_2(findpt_data_2 *p, const real x[2]) -{ - findpt_listel *list = p->list, **sorted = p->sorted; - const uint hi = hash_index_2(p->hash, x); - const uint *offset = p->hash->offset; - uint i; const uint b = offset[hi], e = offset[hi+1]; - for(i=b;i!=e;++i) { - const uint el = offset[i]; - real *r = &list->r[0]; - const obbox_2 *obb = &p->hash->obb[el]; - if(obbox_axis_test_2(obb,x)) continue; - if(obbox_test_2(obb,x,r)) continue; - list->el = el; - list->dist = r1norm_2(r[0],r[1]); - *sorted++ = list++; - } - p->end = sorted; - if(p->end!=p->sorted) - findpt_list_sort(p->sorted,p->end - p->sorted); -} - -static void findpt_hash_3(findpt_data_3 *p, const real x[3]) -{ - findpt_listel *list = p->list, **sorted = p->sorted; - const uint hi = hash_index_3(p->hash, x); - const uint *offset = p->hash->offset; - uint i; const uint b = offset[hi], e = offset[hi+1]; - for(i=b;i!=e;++i) { - const uint el = offset[i]; - real *r = &list->r[0]; - const obbox_3 *obb = &p->hash->obb[el]; - if(obbox_axis_test_3(obb,x)) continue; - if(obbox_test_3(obb,x,r)) continue; - list->el = el; - list->dist = r1norm_3(r[0],r[1],r[2]); - *sorted++ = list++; - } - p->end = sorted; - if(p->end!=p->sorted) - findpt_list_sort(p->sorted,p->end - p->sorted); -} - -static int findpt_guess_2(findpt_data_2 *p, const real x[2], - uint el, real r[2], real *dist) -{ - const uint index = p->nptel*el; - const real *elx[2] = {p->xw[0]+index,p->xw[1]+index}; - real g[2]; - unsigned c = opt_no_constraints_2; - const obbox_2 *obb = &p->hash->obb[el]; - if(obbox_axis_test_2(obb,x) || obbox_test_2(obb,x,g)) return 0; - *dist = opt_findpt_2(p->od,elx,x,r,&c); - return c==opt_no_constraints_2; -} - -static 
int findpt_guess_3(findpt_data_3 *p, const real x[3], - uint el, real r[3], real *dist) -{ - const uint index = p->nptel*el; - const real *elx[3] = {p->xw[0]+index,p->xw[1]+index,p->xw[2]+index}; - real g[3]; - unsigned c = opt_no_constraints_3; - const obbox_3 *obb = &p->hash->obb[el]; - if(obbox_axis_test_3(obb,x) || obbox_test_3(obb,x,g)) return 0; - *dist = opt_findpt_3(p->od,elx,x,r,&c); - return c==opt_no_constraints_3; -} - -#define DIAGNOSTICS 0 - -static int findpt_pass_2(findpt_data_2 *p, const real x[2], - uint *el, real r[2], real *dist_min) -{ - findpt_listel **qq = p->sorted; - const real *bnd; - do { - findpt_listel *q = *qq; - const uint index = p->nptel*q->el; - const real *elx[2] = {p->xw[0]+index,p->xw[1]+index}; - unsigned c = opt_no_constraints_2; - const real dist = opt_findpt_2(p->od,elx,x,q->r,&c); - if(qq==p->sorted || dist<*dist_min || c==opt_no_constraints_2) { - *dist_min = dist; - *el = q->el; - memcpy(r, q->r, 2*sizeof(real)); - if(c==opt_no_constraints_2) return 0; - } - } while(++qq != p->end); - bnd = p->hash->obb[*el].axis_bnd; - return *dist_min>r2norm_2(bnd[1]-bnd[0],bnd[3]-bnd[2]) ? -1 : 1; -} - -static int findpt_pass_3(findpt_data_3 *p, const real x[3], - uint *el, real r[3], real *dist_min) -{ - findpt_listel **qq = p->sorted; - const real *bnd; - do { - findpt_listel *q = *qq; - const uint index = p->nptel*q->el; - const real *elx[3] = {p->xw[0]+index,p->xw[1]+index,p->xw[2]+index}; - unsigned c = opt_no_constraints_3; - const real dist = opt_findpt_3(p->od,elx,x,q->r,&c); - if(qq==p->sorted || dist<*dist_min || c==opt_no_constraints_3) { - *dist_min = dist; - *el = q->el; - memcpy(r, q->r, 3*sizeof(real)); - if(c==opt_no_constraints_3) { -# if DIAGNOSTICS - printf("point found in element #%d\n", qq-p->sorted); -# endif - return 0; - } - } - } while(++qq != p->end); - bnd = p->hash->obb[*el].axis_bnd; - return *dist_min>r2norm_3(bnd[1]-bnd[0],bnd[3]-bnd[2],bnd[5]-bnd[4]) ? 
-1 : 1; -} - -int findpt_2(findpt_data_2 *p, const real x[2], int guess, - uint *el, real r[2], real *dist) -{ - if(guess && findpt_guess_2(p,x,*el,r,dist)) return 0; - findpt_hash_2(p,x); - if(p->sorted==p->end) return -1; - return findpt_pass_2(p,x,el,r,dist); -} - -int findpt_3(findpt_data_3 *p, const real x[3], int guess, - uint *el, real r[3], real *dist) -{ - if(guess && findpt_guess_3(p,x,*el,r,dist)) return 0; - findpt_hash_3(p,x); -# if DIAGNOSTICS - printf("hashing leaves %d elements to consider\n",p->end-p->sorted); -# endif - if(p->sorted==p->end) return -1; - return findpt_pass_3(p,x,el,r,dist); -} - -static void findpt_weights_2(findpt_data_2 *p, const real r[2]) -{ - lagrange_0(&p->ld[0],r[0]); - lagrange_0(&p->ld[1],r[1]); -} - -static void findpt_weights_3(findpt_data_3 *p, const real r[3]) -{ - lagrange_0(&p->ld[0],r[0]); - lagrange_0(&p->ld[1],r[1]); - lagrange_0(&p->ld[2],r[2]); -} - -static double findpt_eval_2(findpt_data_2 *p, const real *u) -{ - return tensor_i2(p->ld[0].J,p->ld[0].n, - p->ld[1].J,p->ld[1].n, - u, p->od_work); -} - -static double findpt_eval_3(findpt_data_3 *p, const real *u) -{ - return tensor_i3(p->ld[0].J,p->ld[0].n, - p->ld[1].J,p->ld[1].n, - p->ld[2].J,p->ld[2].n, - u, p->od_work); -} - diff --git a/3rdParty/gslib.github/src/findpt.h b/3rdParty/gslib.github/src/findpt.h deleted file mode 100644 index 0aee5edd2..000000000 --- a/3rdParty/gslib.github/src/findpt.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef FINDPT_H -#define FINDPT_H - -/* requires "types.h", "poly.h", "tensor.h" */ -#if !defined(TYPES_H) || !defined(POLY_H) || !defined(TENSOR_H) -#warning "findpt.h" requires "types.h", "poly.h", "tensor.h" -#endif - -typedef struct { - const real *xw[2]; /* geometry data */ - real *z[2]; /* lobatto nodes */ - lagrange_data ld[2]; /* interpolation, derivative weights & data */ - unsigned nptel; /* nodes per element */ - struct findpt_hash_data_2 *hash; /* geometric hashing data */ - struct findpt_listel *list, **sorted, **end; 
- /* pre-allocated list of elements to - check (found by hashing), and - pre-allocated list of pointers into - the first list for sorting */ - struct findpt_opt_data_2 *od; /* data for the optimization algorithm */ - real *od_work; -} findpt_data_2; - -typedef struct { - const real *xw[3]; /* geometry data */ - real *z[3]; /* lobatto nodes */ - lagrange_data ld[3]; /* interpolation, derivative weights & data */ - unsigned nptel; /* nodes per element */ - struct findpt_hash_data_3 *hash; /* geometric hashing data */ - struct findpt_listel *list, **sorted, **end; - /* pre-allocated list of elements to - check (found by hashing), and - pre-allocated list of pointers into - the first list for sorting */ - struct findpt_opt_data_3 *od; /* data for the optimization algorithm */ - real *od_work; -} findpt_data_3; - -findpt_data_2 *findpt_setup_2( - const real *const xw[2], const unsigned n[2], uint nel, - uint max_hash_size, real bbox_tol); -findpt_data_3 *findpt_setup_3( - const real *const xw[3], const unsigned n[3], uint nel, - uint max_hash_size, real bbox_tol); - -void findpt_free_2(findpt_data_2 *p); -void findpt_free_3(findpt_data_3 *p); - -const real *findpt_allbnd_2(const findpt_data_2 *p); -const real *findpt_allbnd_3(const findpt_data_3 *p); - -typedef int (*findpt_func)(void *, const real *, int, uint *, real *, real *); -int findpt_2(findpt_data_2 *p, const real x[2], int guess, - uint *el, real r[2], real *dist); -int findpt_3(findpt_data_3 *p, const real x[3], int guess, - uint *el, real r[3], real *dist); - -static void findpt_weights_2(findpt_data_2 *p, const real r[2]) -{ - lagrange_0(&p->ld[0],r[0]); - lagrange_0(&p->ld[1],r[1]); -} - -static void findpt_weights_3(findpt_data_3 *p, const real r[3]) -{ - lagrange_0(&p->ld[0],r[0]); - lagrange_0(&p->ld[1],r[1]); - lagrange_0(&p->ld[2],r[2]); -} - -static double findpt_eval_2(findpt_data_2 *p, const real *u) -{ - return tensor_i2(p->ld[0].J,p->ld[0].n, - p->ld[1].J,p->ld[1].n, - u, p->od_work); -} - 
-static double findpt_eval_3(findpt_data_3 *p, const real *u) -{ - return tensor_i3(p->ld[0].J,p->ld[0].n, - p->ld[1].J,p->ld[1].n, - p->ld[2].J,p->ld[2].n, - u, p->od_work); -} - -#endif - diff --git a/3rdParty/gslib.github/src/findpt2.c b/3rdParty/gslib.github/src/findpt2.c deleted file mode 100644 index 00cd73441..000000000 --- a/3rdParty/gslib.github/src/findpt2.c +++ /dev/null @@ -1,2245 +0,0 @@ -#include -#include -#include -#include /* for cos, fabs */ -#include -#include /* for memcpy */ - -#include "errmem.h" -#include "types.h" -#include "minmax.h" -#include "poly.h" -#include "tensor.h" - -/*-------------------------------------------------------------------------- - Lobatto Polynomial Bounds - - Needed inputs are the Gauss-Lobatto quadrature nodes and weights: - unsigned nr = ..., ns = ...; - real zr[nr], wr[nr]; - real zs[ns], ws[ns]; - - lobatto_nodes(zr,nr); lobatto_weights(zr,wr,nr); - lobatto_nodes(zs,ns); lobatto_weights(zs,ws,ns); - - The number of points in the constructed piecewise (bi-)linear bounds - is a parameter; more points give tighter bounds - - unsigned mr = 2*nr, ms = 2*ns; - - The necessary setup is accomplished via: - lob_bnd_base b_data_r; - lob_bnd_ext e_data_s; - - lob_bnd_base_alloc(&b_data_r,nr,mr); - lob_bnd_base_setup(&b_data_r,zr,wr); - lob_bnd_ext_alloc(&e_data_s,ns,ms); - lob_bnd_ext_setup(&e_data_s,zs,ws); - - Bounds may then be computed via: - real work1r[2*mr], work1s[2*ms], work2[2*mr + 2*mr*ns + 2*mr*ms]; - real ur[nr], us[ns]; // 1-d polynomials on the zr[] and zs[] nodes - real u[ns][nr]; // 2-d polynomial on zr[] (x) zs[] - real bound[2]; // = { min, max } (to be computed) - - lob_bnd_1(&b_data_r ,ur,bound,work1r); // compute bounds on ur - lob_bnd_1(&e_data_s.b,us,bound,work1s); // compute bounds on us - lob_bnd_2(&b_data_r, &e_data_s, - (const double*)&u[0][0],bound,work2); // compute bounds on u - The above routines access the zr,zs arrays passed to *_setup - (so do not delete them between calls) - - Memory 
allocated in *_setup is freed with - lob_bnd_base_free(&b_data_r); - lob_bnd_ext_free(&e_data_s); - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned n; /* number of Lobatto nodes in input */ - unsigned m; /* number of Chebyshev nodes used to calculate bounds */ - real *Q0, *Q1; /* Q0[n], Q1[n] -- first two rows of change of basis matrix - from Lobatto node Lagrangian to Legendre */ - const real *z; /* z[n] -- external; Lobatto nodes */ - real *h; /* h[m] -- Chebyshev nodes */ - real *uv, *ov; /* uv[n][m], ov[n][m] -- - uv[j][:] is a piecewise linear function in the nodal - basis with nodes h[m] that is everywhere less - than or equal to the jth Lagrangian basis - function (with the Lobatto nodes z[n]) - ov[j][:] is everywhere greater than or equal */ -} lob_bnd_base; - -typedef struct { - lob_bnd_base b; - real *uvp, *uvn, *ovp, *ovn; /* [n][m] -- uv and ov split into - positive and negative parts */ -} lob_bnd_ext; - -static void lob_bnd_base_alloc(lob_bnd_base *p, unsigned n, unsigned m) -{ - p->n = n, p->m = m; - p->Q0 = tmalloc(real,2*n+m+2*n*m); - p->Q1 = p->Q0+n; - p->h = p->Q1+n; - p->uv = p->h +m; - p->ov = p->uv+n*m; -} - -static void lob_bnd_base_free(lob_bnd_base *p) -{ - free(p->Q0); -} - -static void lob_bnd_ext_alloc(lob_bnd_ext *p, unsigned n, unsigned m) -{ - p->b.n = n, p->b.m = m; - p->b.Q0 = tmalloc(real,2*n+m+6*n*m); - p->b.Q1 = p->b.Q0+n; - p->b.h = p->b.Q1+n; - p->b.uv = p->b.h +m; - p->b.ov = p->b.uv+n*m; - p->uvp = p->b.ov+n*m; - p->uvn = p->uvp +n*m; - p->ovp = p->uvn +n*m; - p->ovn = p->ovp +n*m; -} - -static void lob_bnd_ext_free(lob_bnd_ext *p) -{ - free(p->b.Q0); -} - -static void lob_bnd_base_setup(lob_bnd_base *p, const real *z, const real *w) -{ - unsigned i,j,m=p->m,n=p->n,mm=2*m-1; - real *q = tmalloc(real,(2*n+1)*mm+6*n), - *J = q+mm, *D = J+n*mm, *work = D+n*mm; - p->z = z; - for(i=0;iQ0[i]=w[i]/2, p->Q1[i] = 3*p->Q0[i]*z[i]; - p->h[0] = -1, p->h[m-1] = 1; - 
for(j=1;jh[j] = cosr((m-j-1)*PI/(m-1)); - for(j=0;jh[j], q[2*j+1] = (p->h[j]+p->h[j+1])/2; - q[mm-1] = p->h[m-1]; - lagrange_weights_deriv(z,n,q,mm,J,D,work); - for(i=0;iuv+i*m, *ov = p->ov+i*m; - ov[0] = uv[0] = J[i]; - ov[m-1] = uv[m-1] = J[(mm-1)*n+i]; - for(j=1;jb.m * p->b.n; - lob_bnd_base_setup(&p->b,z,w); - for(i=0;ib.uv[i], ovi = p->b.ov[i]; - p->uvp[i] = p->uvn[i] = p->ovp[i] = p->ovn[i] = 0; - if(uvi > 0) p->uvp[i]=uvi; else p->uvn[i]=uvi; - if(ovi > 0) p->ovp[i]=ovi; else p->ovn[i]=ovi; - } -} - -static void lob_bnd_lines(const lob_bnd_base *p, const real *u, - real *a, real *b) -{ - unsigned i,j; - real a0=0, a1=0; - const real *uv = p->uv, *ov = p->ov; - for(i=0;in;++i) a0 += p->Q0[i]*u[i], a1 += p->Q1[i]*u[i]; - for(j=0;jm;++j) b[j] = a[j] = a0 + a1*p->h[j]; - for(i=0;in;++i) { - real w = u[i] - (a0 + a1*p->z[i]); - if(w>=0) - for(j=0;jm;++j) a[j]+=w*(*uv++), b[j]+=w*(*ov++); - else - for(j=0;jm;++j) a[j]+=w*(*ov++), b[j]+=w*(*uv++); - } -} - -/* work holds p->m * 2 doubles */ -static void lob_bnd_1(const lob_bnd_base *p, const real *u, real bnd[2], - real *work) -{ - unsigned j; - real *a = work, *b = work+p->m; - lob_bnd_lines(p,u,a,b); - bnd[0] = a[0], bnd[1] = b[0]; - for(j=1;jm;++j) { - if(a[j]bnd[1]) bnd[1]=b[j]; - } -} - -/* work holds 2*mr + 2*mr*ns + 2*mr*ms doubles */ -static void lob_bnd_2(const lob_bnd_base *pr, const lob_bnd_ext *ps, - const real *u, real bnd[2], real *work) -{ - unsigned nr = pr->n, mr = pr->m, ns = ps->b.n, ms = ps->b.m; - real *a0 = work, *a1 = a0+mr, - *ar_= a1+mr, *ar=ar_, - *br_= ar+mr*ns, *br=br_, - *a_ = br+mr*ns, *a =a_, - *b_ = a +mr*ms, *b =b_, - *uvp,*ovp,*uvn,*ovn; - real b0,b1; - unsigned i,j,k; - for(i=0;ib.Q0[j], q1 = ps->b.Q1[j]; - lob_bnd_lines(pr,u,ar,br); - for(i=0;ib.h[k]; - } - ar = ar_, br = br_; - uvp=ps->uvp, ovp=ps->ovp, uvn=ps->uvn, ovn=ps->ovn; - for(j=0;jb.z[j]; - a = a_, b = b_; - for(i=0;i=0) /* 0 <= uw <= ow */ - for(k=0;kb1) b1=b_[i]; - } - bnd[0] = b0, bnd[1] = b1; -} - 
-/*-------------------------------------------------------------------------- - Small Matrix Inverse - --------------------------------------------------------------------------*/ - -static void mat_inv_2(const real A[4], real inv[4]) -{ - const real idet = 1/(A[0]*A[3]-A[1]*A[2]); - inv[0] = idet*A[3]; - inv[1] = -(idet*A[1]); - inv[2] = -(idet*A[2]); - inv[3] = idet*A[0]; -} - -static void mat_inv_3(const real A[9], real inv[9]) -{ - const real a = A[4]*A[8]-A[5]*A[7], - b = A[5]*A[6]-A[3]*A[8], - c = A[3]*A[7]-A[4]*A[6], - idet = 1/(A[0]*a+A[1]*b+A[2]*c); - inv[0] = idet*a; - inv[1] = idet*(A[2]*A[7]-A[1]*A[8]); - inv[2] = idet*(A[1]*A[5]-A[2]*A[4]); - inv[3] = idet*b; - inv[4] = idet*(A[0]*A[8]-A[2]*A[6]); - inv[5] = idet*(A[2]*A[3]-A[0]*A[5]); - inv[6] = idet*c; - inv[7] = idet*(A[1]*A[6]-A[0]*A[7]); - inv[8] = idet*(A[0]*A[4]-A[1]*A[3]); -} - -static void mat_app_2r(real y[2], const real A[4], const real x[2]) -{ - y[0] = A[0]*x[0] + A[1]*x[1]; - y[1] = A[2]*x[0] + A[3]*x[1]; -} - -static void mat_app_2c(real y[2], const real A[4], const real x[2]) -{ - y[0] = A[0]*x[0] + A[2]*x[1]; - y[1] = A[1]*x[0] + A[3]*x[1]; -} - -static void mat_app_3r(real y[3], const real A[9], const real x[3]) -{ - y[0] = A[0]*x[0] + A[1]*x[1] + A[2]*x[2]; - y[1] = A[3]*x[0] + A[4]*x[1] + A[5]*x[2]; - y[2] = A[6]*x[0] + A[7]*x[1] + A[8]*x[2]; -} - -static void mat_app_3c(real y[3], const real A[9], const real x[3]) -{ - y[0] = A[0]*x[0] + A[3]*x[1] + A[6]*x[2]; - y[1] = A[1]*x[0] + A[4]*x[1] + A[7]*x[2]; - y[2] = A[2]*x[0] + A[5]*x[1] + A[8]*x[2]; -} - -static void tinyla_solve_2(real x[2], const real A[4], const real b[2]) -{ - real inv[4]; - mat_inv_2(A,inv); - mat_app_2r(x,inv,b); -} - -static void tinyla_solve_3(real x[3], const real A[9], const real b[3]) -{ - real inv[9]; - mat_inv_3(A,inv); - mat_app_3r(x,inv,b); -} - -/* solve - A[0] x0 + A[2] x1 = b0, - A[2] x0 + A[1] x1 = b1 -*/ -static void tinyla_solve_sym_2(real *x0, real *x1, const real A[3], - real b0, real b1) -{ - 
const real idet = 1/(A[0]*A[1] - A[2]*A[2]); - *x0 = idet * (A[1]*b0 - A[2]*b1); - *x1 = idet * (A[0]*b1 - A[2]*b0); -} - -/*-------------------------------------------------------------------------- - Oriented Bounding Box - - Suffixes on names are _2 for 2-d and _3 for 3-d - - Needed inputs are the Gauss-Lobatto quadrature nodes and weights: - unsigned nr = ..., ns = ...; - real zr[nr], wr[nr]; - real zs[ns], ws[ns]; - - lobatto_nodes(zr,nr); lobatto_weights(zr,wr,nr); - lobatto_nodes(zs,ns); lobatto_weights(zs,ws,ns); - - The number of points in the constructed piecewise (bi-)linear bounds - for the boundaries is a parameter; more points give tighter bounds - - unsigned mr = 2*nr, ms = 2*ns; - - Bounding boxes are increased by a relative amount as a parameter - - real tol = 0.01; - - Setup is accomplished via: - - const real *z[2] = {zr,zs}, *w[2] = {wr,ws}; - const unsigned n[2] = {nr,ns}, m[2] = {mr,ms}; - obbox_data_2 *data = obbox_setup_2(z,w,n,m); - - Bounding box data may then be computed: - - obbox_2 box; // will store bounding box information - real xm[ns][nr], ym[ns][nr]; // x, y coordinates of the element nodes - - obbox_calc_2(data, tol, (const real *)&xm[0][0], - (const real *)&ym[0][0], &box); - - A point may be tested: - - const real x[2]; // point to test - real r[2]; - - if( obbox_axis_test_2(&box, x) ) - ... // x failed axis-aligned bounding box test - - if( obbox_test_2(&box, x, r) ) - ... // x failed oriented bounding box test - else - ... // r suitable as initial guess for parametric coords - - Once all bounding box information has been computed - - obbox_free_2(data); - - to free the memory allocated with obbox_setup_2. 
- - --------------------------------------------------------------------------*/ - -typedef struct { - lob_bnd_base dr, ds; - real *Jr0, *Dr0, *Js0, *Ds0, *work; -} obbox_data_2; - -typedef struct { - lob_bnd_base dr; - lob_bnd_ext ds, dt; - real *Jr0, *Dr0, *Js0, *Ds0, *Jt0, *Dt0, *work; -} obbox_data_3; - -static void obbox_data_alloc_2(obbox_data_2 *p, - const unsigned n[2], const unsigned m[2]) -{ - const unsigned max_npm = umax_2(n[0]+m[0],n[1]+m[1]); - lob_bnd_base_alloc(&p->dr, n[0], m[0]); - lob_bnd_base_alloc(&p->ds, n[1], m[1]); - p->Jr0 = tmalloc(real,2*n[0]+2*n[1]+2*max_npm); - p->Dr0 = p->Jr0 + n[0]; - p->Js0 = p->Dr0 + n[0]; - p->Ds0 = p->Js0 + n[1]; - p->work = p->Ds0 + n[1]; -} - -static void obbox_data_free_2(obbox_data_2 *p) -{ - lob_bnd_base_free(&p->dr); - lob_bnd_base_free(&p->ds); - free(p->Jr0); -} - -static void obbox_data_alloc_3(obbox_data_3 *p, - const unsigned n[3], const unsigned m[3]) -{ - const unsigned wk1 = 3*n[0]*n[1] + 2*m[0] + 2*m[0]*n[1] + 2*m[0]*m[1]; - const unsigned wk2 = 3*n[0]*n[2] + 2*m[0] + 2*m[0]*n[2] + 2*m[0]*m[2]; - const unsigned wk3 = 3*n[1]*n[2] + 2*m[1] + 2*m[1]*n[2] + 2*m[1]*m[2]; - const unsigned wk_max = umax_3(wk1,wk2,wk3); - lob_bnd_base_alloc(&p->dr, n[0], m[0]); - lob_bnd_ext_alloc(&p->ds, n[1], m[1]); - lob_bnd_ext_alloc(&p->dt, n[2], m[2]); - p->Jr0 = tmalloc(real,2*n[0]+2*n[1]+2*n[2] + wk_max); - p->Dr0 = p->Jr0 + n[0]; - p->Js0 = p->Dr0 + n[0]; - p->Ds0 = p->Js0 + n[1]; - p->Jt0 = p->Ds0 + n[1]; - p->Dt0 = p->Jt0 + n[2]; - p->work = p->Dt0 + n[2]; -} - -static void obbox_data_free_3(obbox_data_3 *p) -{ - lob_bnd_base_free(&p->dr); - lob_bnd_ext_free(&p->ds); - lob_bnd_ext_free(&p->dt); - free(p->Jr0); -} - -static obbox_data_2 *obbox_setup_2(const real *const z[2], - const real *const w[2], - const unsigned n[2], const unsigned m[2]) -{ - const real zero = 0; - real *work; - obbox_data_2 *p = tmalloc(obbox_data_2,1); - obbox_data_alloc_2(p,n,m); - lob_bnd_base_setup(&p->dr,z[0],w[0]); - 
lob_bnd_base_setup(&p->ds,z[1],w[1]); - work = tmalloc(real,6*umax_2(n[0],n[1])); - lagrange_weights_deriv(z[0],n[0],&zero,1,p->Jr0,p->Dr0,work); - lagrange_weights_deriv(z[1],n[1],&zero,1,p->Js0,p->Ds0,work); - free(work); - return p; -} - -static obbox_data_3 *obbox_setup_3(const real *const z[3], - const real *const w[3], - const unsigned n[3], const unsigned m[3]) -{ - const real zero = 0; - real *work; - obbox_data_3 *p = tmalloc(obbox_data_3,1); - obbox_data_alloc_3(p,n,m); - lob_bnd_base_setup(&p->dr,z[0],w[0]); - lob_bnd_ext_setup(&p->ds,z[1],w[1]); - lob_bnd_ext_setup(&p->dt,z[2],w[2]); - work = tmalloc(real,6*umax_3(n[0],n[1],n[2])); - lagrange_weights_deriv(z[0],n[0],&zero,1,p->Jr0,p->Dr0,work); - lagrange_weights_deriv(z[1],n[1],&zero,1,p->Js0,p->Ds0,work); - lagrange_weights_deriv(z[2],n[2],&zero,1,p->Jt0,p->Dt0,work); - free(work); - return p; -} - -static void obbox_free_2(obbox_data_2 *p) -{ - obbox_data_free_2(p); - free(p); -} - -static void obbox_free_3(obbox_data_3 *p) -{ - obbox_data_free_3(p); - free(p); -} - -typedef struct { - real x[2], A[4], axis_bnd[4]; -} obbox_2; - -typedef struct { - real x[3], A[9], axis_bnd[6]; -} obbox_3; - -static int obbox_axis_test_2(const obbox_2 *p, const real x[2]) -{ - return (x[0]axis_bnd[0] || x[0]>p->axis_bnd[1] || - x[1]axis_bnd[2] || x[1]>p->axis_bnd[3]); -} - -static int obbox_axis_test_3(const obbox_3 *p, const real x[3]) -{ - return (x[0]axis_bnd[0] || x[0]>p->axis_bnd[1] || - x[1]axis_bnd[2] || x[1]>p->axis_bnd[3] || - x[2]axis_bnd[4] || x[2]>p->axis_bnd[5]); -} - -static int obbox_test_2(const obbox_2 *p, const real x[2], real r[2]) -{ - const real xt[2] = {x[0]-p->x[0],x[1]-p->x[1]}; - r[0] = p->A[0]*xt[0] + p->A[1]*xt[1]; - if(fabsr(r[0])>1) return 1; - r[1] = p->A[2]*xt[0] + p->A[3]*xt[1]; - return fabsr(r[1])>1; -} - -static int obbox_test_3(const obbox_3 *p, const real x[3], real r[3]) -{ - const real xt[3] = {x[0]-p->x[0],x[1]-p->x[1],x[2]-p->x[2]}; - r[0] = p->A[0]*xt[0] + p->A[1]*xt[1] + 
p->A[2]*xt[2]; - if(fabsr(r[0])>1) return 1; - r[1] = p->A[3]*xt[0] + p->A[4]*xt[1] + p->A[5]*xt[2]; - if(fabsr(r[1])>1) return 1; - r[2] = p->A[6]*xt[0] + p->A[7]*xt[1] + p->A[8]*xt[2]; - return fabsr(r[2])>1; -} - -static void obbox_calc_tfm_2(const real *x, const real *y, - unsigned n, unsigned s, - const real c0[2], const real A[4], real *u) -{ - unsigned i; - real *v = u+n; - for(i=0; ib[1]) b[1]=ob[1]; - if(ob[2]b[3]) b[3]=ob[3]; -} - -static void obbox_merge_3(real *b, const real *ob) -{ - if(ob[0]b[1]) b[1]=ob[1]; - if(ob[2]b[3]) b[3]=ob[3]; - if(ob[4]b[5]) b[5]=ob[5]; -} - -/* work holds 2*n + 2*m reals */ -static void obbox_side_2(const real *x, const real *y, - unsigned n, unsigned s, - const real c0[2], const real A[4], real *work, - const lob_bnd_base *lbd, real bnd[4]) -{ - obbox_calc_tfm_2(x,y,n,s,c0,A,work); - lob_bnd_1(lbd,work ,bnd ,work+2*n); - lob_bnd_1(lbd,work+n,bnd+2,work+2*n); -} - -/* work holds 3*nr*ns + 2*mr + 2*mr*ns + 2*mr*ms reals */ -static void obbox_side_3(const real *x, const real *y, const real *z, - unsigned nr, unsigned sr, unsigned ns, unsigned ss, - const real c0[3], const real A[9], real *work, - const lob_bnd_base *dr, const lob_bnd_ext *ds, - real bnd[6]) -{ - obbox_calc_tfm_3(x,y,z,nr,sr,ns,ss,c0,A,work); - lob_bnd_2(dr,ds,work ,bnd ,work+3*nr*ns); - lob_bnd_2(dr,ds,work+ nr*ns,bnd+2,work+3*nr*ns); - lob_bnd_2(dr,ds,work+2*nr*ns,bnd+4,work+3*nr*ns); -} - -/* return bounds on u = A (x - c0) - bnd[0] <= u_0 <= bnd[1] - bnd[2] <= u_1 <= bnd[3] */ -static void obbox_bnd_2(const obbox_data_2 *p, - const real *x, const real *y, - const real c0[2], const real A[4], - real bnd[4]) -{ - unsigned i, nr = p->dr.n, ns = p->ds.n; - real obnd[4]; - - i = nr*(ns-1); - obbox_side_2(x ,y , nr, 1, c0,A,p->work, &p->dr, bnd); - obbox_side_2(x+i,y+i, nr, 1, c0,A,p->work, &p->dr, obnd); - obbox_merge_2(bnd,obnd); - - i = nr-1; - obbox_side_2(x ,y , ns,nr, c0,A,p->work, &p->ds, obnd); - obbox_merge_2(bnd,obnd); - obbox_side_2(x+i,y+i, nr,nr, 
c0,A,p->work, &p->ds, obnd); - obbox_merge_2(bnd,obnd); -} - -/* return bounds on u = A (x - c0) - bnd[0] <= u_0 <= bnd[1] - bnd[2] <= u_1 <= bnd[3] - bnd[4] <= u_2 <= bnd[5] */ -static void obbox_bnd_3(const obbox_data_3 *p, - const real *x, const real *y, const real *z, - const real c0[3], const real A[9], - real bnd[6]) -{ - unsigned i, nr = p->dr.n, ns = p->ds.b.n, nt = p->dt.b.n; - real obnd[6]; - - i = nr*ns*(nt-1); - obbox_side_3(x ,y ,z , nr, 1,ns,0, c0,A,p->work, &p->dr ,&p->ds, bnd); - obbox_side_3(x+i,y+i,z+i, nr, 1,ns,0, c0,A,p->work, &p->dr ,&p->ds, obnd); - obbox_merge_3(bnd,obnd); - - i = nr*(ns-1); - obbox_side_3(x ,y ,z , nr, 1,nt,i, c0,A,p->work, &p->dr ,&p->dt, obnd); - obbox_merge_3(bnd,obnd); - obbox_side_3(x+i,y+i,z+i, nr, 1,nt,i, c0,A,p->work, &p->dr ,&p->dt, obnd); - obbox_merge_3(bnd,obnd); - - i = nr-1; - obbox_side_3(x ,y ,z , ns,nr,nt,0, c0,A,p->work, &p->ds.b,&p->dt, obnd); - obbox_merge_3(bnd,obnd); - obbox_side_3(x+i,y+i,z+i, ns,nr,nt,0, c0,A,p->work, &p->ds.b,&p->dt, obnd); - obbox_merge_3(bnd,obnd); -} - -static void obbox_calc_2(const obbox_data_2 *p, real tol, - const real *x, const real *y, obbox_2 *b) -{ - const real zero[2] = {0,0}, id[4] = {1,0,0,1}; - real c0[2], jac[4], inv[4], bnd[4], u0[2], d[2]; - - obbox_bnd_2(p,x,y,zero,id,b->axis_bnd); - d[0] = b->axis_bnd[1]-b->axis_bnd[0]; - d[1] = b->axis_bnd[3]-b->axis_bnd[2]; - b->axis_bnd[0] -= tol*d[0], b->axis_bnd[1] += tol*d[0]; - b->axis_bnd[2] -= tol*d[1], b->axis_bnd[3] += tol*d[1]; - - c0[0] = tensor_ig2(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.n, - x, jac , p->work); - c0[1] = tensor_ig2(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.n, - y, jac+2, p->work); - mat_inv_2(jac,inv); - - obbox_bnd_2(p,x,y,c0,inv,bnd); - - u0[0] = (bnd[0]+bnd[1])/2; - u0[1] = (bnd[2]+bnd[3])/2; - d[0] = 2/((1+tol)*(bnd[1]-bnd[0])); - d[1] = 2/((1+tol)*(bnd[3]-bnd[2])); - b->x[0] = c0[0] + jac[0]*u0[0] + jac[1]*u0[1]; - b->x[1] = c0[1] + jac[2]*u0[0] + jac[3]*u0[1]; - b->A[0] = d[0]*inv[0], 
b->A[1] = d[0]*inv[1]; - b->A[2] = d[1]*inv[2], b->A[3] = d[1]*inv[3]; -} - -static void obbox_calc_3(const obbox_data_3 *p, real tol, - const real *x, const real *y, const real *z, - obbox_3 *b) -{ - const real zero[3] = {0,0}, id[9] = {1,0,0,0,1,0,0,0,1}; - real c0[3], jac[9], inv[9], bnd[6], u0[3], d[3]; - - obbox_bnd_3(p,x,y,z,zero,id,b->axis_bnd); - d[0] = b->axis_bnd[1]-b->axis_bnd[0]; - d[1] = b->axis_bnd[3]-b->axis_bnd[2]; - d[2] = b->axis_bnd[5]-b->axis_bnd[4]; - b->axis_bnd[0] -= tol*d[0], b->axis_bnd[1] += tol*d[0]; - b->axis_bnd[2] -= tol*d[1], b->axis_bnd[3] += tol*d[1]; - b->axis_bnd[4] -= tol*d[2], b->axis_bnd[5] += tol*d[2]; - - c0[0] = tensor_ig3(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.b.n, - p->Jt0,p->Dt0,p->dt.b.n, - x, jac , p->work); - c0[1] = tensor_ig3(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.b.n, - p->Jt0,p->Dt0,p->dt.b.n, - y, jac+3, p->work); - c0[2] = tensor_ig3(p->Jr0,p->Dr0,p->dr.n, - p->Js0,p->Ds0,p->ds.b.n, - p->Jt0,p->Dt0,p->dt.b.n, - z, jac+6, p->work); - mat_inv_3(jac,inv); - - obbox_bnd_3(p,x,y,z,c0,inv,bnd); - - u0[0] = (bnd[0]+bnd[1])/2; - u0[1] = (bnd[2]+bnd[3])/2; - u0[2] = (bnd[4]+bnd[5])/2; - d[0] = 2/((1+tol)*(bnd[1]-bnd[0])); - d[1] = 2/((1+tol)*(bnd[3]-bnd[2])); - d[2] = 2/((1+tol)*(bnd[5]-bnd[4])); - b->x[0] = c0[0] + jac[0]*u0[0] + jac[1]*u0[1] + jac[2]*u0[2]; - b->x[1] = c0[1] + jac[3]*u0[0] + jac[4]*u0[1] + jac[5]*u0[2]; - b->x[2] = c0[2] + jac[6]*u0[0] + jac[7]*u0[1] + jac[8]*u0[2]; - b->A[0] = d[0]*inv[0], b->A[1] = d[0]*inv[1], b->A[2] = d[0]*inv[2]; - b->A[3] = d[1]*inv[3], b->A[4] = d[1]*inv[4], b->A[5] = d[1]*inv[5]; - b->A[6] = d[2]*inv[6], b->A[7] = d[2]*inv[7], b->A[8] = d[2]*inv[8]; -} - -/*-------------------------------------------------------------------------- - Point to Possible Elements Hashing - - Initializing the data: - unsigned nel; // number of elements - const unsigned n[3]; // number of nodes in r, s, t directions - const real *xm[3]; // n[0]*n[1]*n[2]*nel x,y,z coordinates - real tol = 
0.01; // how far point is allowed to be outside element - // relative to element size - unsigned max_size = n[0]*n[1]*n[2]*nel; // maximum size of hash table - - hash_data_3 data; - hash_build_3(&data, xm, n, nel, max_size, tol); - - Using the data: - real x[3]; // point to find - - unsigned index = hash_index_3(&data, x); - unsigned i, b = data.offset[index], e = data.offset[index+1]; - - // point may be in elements - // data.offset[b], data.offset[b+1], ... , data.offset[e-1] - // - // list has maximum size data.max (e.g., e-b <= data.max) - - for(i=b; i!=e; ++i) { - unsigned el = data.offset[i]; - const obbox_3 *obb = &data.obb[el]; // bounding box data for element el - ... - } - - When done: - hash_free_3(&data); - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned hash_n; - real bnd[4]; /* bounds for all elements */ - real fac[2]; /* fac[i] = hash_n / (bnd[2*i+1]-bnd[2*i]) */ - obbox_2 *obb; /* obb[nel] -- bounding box info for each element */ - uint *offset; /* hash table -- for cell i,j: - uint index = j*hash_n+i, - b = offset[index ], - e = offset[index+1]; - elements in cell are - offset[b], offset[b+1], ..., offset[e-1] */ - unsigned max; /* maximum # of elements in any cell */ -} hash_data_2; - -typedef struct { - unsigned hash_n; - real bnd[6]; /* bounds for all elements */ - real fac[3]; /* fac[i] = hash_n / (bnd[2*i+1]-bnd[2*i]) */ - obbox_3 *obb; /* obb[nel] -- bounding box info for each element */ - uint *offset; /* hash table -- for cell i,j,k: - uint index = (k*hash_n+j)*hash_n+i, - b = offset[index ], - e = offset[index+1]; - elements in cell are - offset[b], offset[b+1], ..., offset[e-1] */ - unsigned max; /* maximum # of elements in any cell */ -} hash_data_3; - -static int ifloor(real x) -{ - /* - int y = x; - return (double)y > x ? y-1 : y; - */ - return floorr(x); -} - -static int iceil(real x) -{ - /* - int y = x; - return (double)y < x ? 
y+1 : y; - */ - return ceilr(x); -} - -static unsigned hash_index_helper(real low, real fac, unsigned n, real x) -{ - const int i = ifloor((x-low)*fac); - if(i<0) return 0; - return umin_2(i,n-1); -} - -static uint hash_index_2(const hash_data_2 *p, const real x[2]) -{ - const unsigned n = p->hash_n; - return (uint)hash_index_helper(p->bnd[2],p->fac[1],n,x[1])*n - +hash_index_helper(p->bnd[0],p->fac[0],n,x[0]); -} - -static uint hash_index_3(const hash_data_3 *p, const real x[3]) -{ - const unsigned n = p->hash_n; - return ( (uint)hash_index_helper(p->bnd[4],p->fac[2],n,x[2]) *n - +hash_index_helper(p->bnd[2],p->fac[1],n,x[1]) )*n - +hash_index_helper(p->bnd[0],p->fac[0],n,x[0]); -} - -static void hash_setfac_2(hash_data_2 *p, unsigned n) -{ - p->hash_n = n; - p->fac[0] = n/(p->bnd[1] - p->bnd[0]); - p->fac[1] = n/(p->bnd[3] - p->bnd[2]); -} - -static void hash_setfac_3(hash_data_3 *p, unsigned n) -{ - p->hash_n = n; - p->fac[0] = n/(p->bnd[1] - p->bnd[0]); - p->fac[1] = n/(p->bnd[3] - p->bnd[2]); - p->fac[2] = n/(p->bnd[5] - p->bnd[4]); -} - -static void hash_range_2(const hash_data_2 *p, uint i, unsigned d, - unsigned *ia, unsigned *ib) -{ - const real a = p->obb[i].axis_bnd[d*2 ]; - const real b = p->obb[i].axis_bnd[d*2+1]; - const int i0 = ifloor( (a - p->bnd[d*2]) * p->fac[d] ); - const unsigned i1 = iceil( (b - p->bnd[d*2]) * p->fac[d] ); - *ia = imax_2(0,i0); - *ib = imin_2(i1,p->hash_n); - if(*ib == *ia) ++(*ib); -} - -static void hash_range_3(const hash_data_3 *p, uint i, unsigned d, - unsigned *ia, unsigned *ib) -{ - const real a = p->obb[i].axis_bnd[d*2 ]; - const real b = p->obb[i].axis_bnd[d*2+1]; - const int i0 = ifloor( (a - p->bnd[d*2]) * p->fac[d] ); - const unsigned i1 = iceil( (b - p->bnd[d*2]) * p->fac[d] ); - *ia = imax_2(0,i0); - *ib = imin_2(i1,p->hash_n); - if(*ib == *ia) ++(*ib); -} - -static uint hash_count_2(hash_data_2 *p, uint nel, unsigned n) -{ - uint i,count=0; - hash_setfac_2(p,n); - for(i=0;i1) { - unsigned nm = nl+(nu-nl)/2; - 
uint size = (uint)nm*nm+1+hash_count_2(p,nel,nm); - if(size<=max_size) nl=nm,size_low=size; else nu=nm; - } - hash_setfac_2(p,nl); - return size_low; -} - -static uint hash_opt_size_3(hash_data_3 *p, uint nel, uint max_size) -{ - unsigned nl=1, nu = ceil(pow(max_size-nel,1.0/3)); - uint size_low = 2+nel; - while(nu-nl>1) { - unsigned nm = nl+(nu-nl)/2; - uint size = (uint)nm*nm*nm+1+hash_count_3(p,nel,nm); - if(size<=max_size) nl=nm,size_low=size; else nu=nm; - } - hash_setfac_3(p,nl); - return size_low; -} - -static void hash_getbb_2(hash_data_2 *p, const real *const elx[2], - const unsigned n[2], uint nel, real tol) -{ - obbox_data_2 *data; - const real *x[2]={elx[0],elx[1]}; - real *z[2], *w[2]; - uint i; unsigned d; - const unsigned nn = n[0]*n[1], m[2] = {2*n[0],2*n[1]}; - - z[0] = tmalloc(real,2*(n[0]+n[1])); - w[0] = z[0] + n[0]; - z[1] = w[0] + n[0], w[1] = z[1] + n[1]; - for(d=0;d<2;++d) - lobatto_nodes(z[d],n[d]), lobatto_weights(z[d],w[d],n[d]); - data = obbox_setup_2((const real *const*)z,(const real *const*)w,n,m); - obbox_calc_2(data,tol,x[0],x[1],&p->obb[0]); - memcpy(&p->bnd[0],(const real*)&p->obb[0].axis_bnd[0],4*sizeof(real)); - for(i=0;iobb[i]); - obbox_merge_2(&p->bnd[0],(const real*)&p->obb[i].axis_bnd[0]); - } - obbox_free_2(data); - free(z[0]); -} - -static void hash_getbb_3(hash_data_3 *p, const real *const elx[3], - const unsigned n[3], uint nel, real tol) -{ - obbox_data_3 *data; - const real *x[3]={elx[0],elx[1],elx[2]}; - real *z[3], *w[3]; - uint i; unsigned d; - const unsigned nn = n[0]*n[1]*n[2], m[3] = {2*n[0],2*n[1],2*n[2]}; - - z[0] = tmalloc(real,2*(n[0]+n[1]+n[2])); - w[0] = z[0] + n[0]; - for(d=1;d<3;++d) z[d]=w[d-1]+n[d-1], w[d]=z[d]+n[d]; - for(d=0;d<3;++d) - lobatto_nodes(z[d],n[d]), lobatto_weights(z[d],w[d],n[d]); - data = obbox_setup_3((const real *const*)z,(const real *const*)w,n,m); - obbox_calc_3(data,tol,x[0],x[1],x[2],&p->obb[0]); - memcpy(&p->bnd[0],(const real*)&p->obb[0].axis_bnd[0],6*sizeof(real)); - 
for(i=0;iobb[i]); - obbox_merge_3(&p->bnd[0],(const real*)&p->obb[i].axis_bnd[0]); - } - obbox_free_3(data); - free(z[0]); -} - -static void hash_build_2(hash_data_2 *p, const real *const x[2], - const unsigned n[2], uint nel, - uint max_hash_size, real tol) -{ - uint i,el,size,hn2,sum; unsigned hn; - unsigned *count; - p->obb = tmalloc(obbox_2,nel); - hash_getbb_2(p,x,n,nel,tol); - size = hash_opt_size_2(p,nel,max_hash_size); - p->offset = tmalloc(uint,size); - hn = p->hash_n; - hn2 = (uint)hn*hn; - count = tcalloc(unsigned,hn2); - for(el=0;elmax=count[0]; - p->offset[0]=sum; - for(i=0;ip->max) p->max=count[i]; - sum+=count[i]; - p->offset[i+1]=sum; - } - for(el=0;eloffset[p->offset[index+1] - count[index]] = el; - --count[index]; - } - } - free(count); -} - -static void hash_build_3(hash_data_3 *p, const real *const x[3], - const unsigned n[3], uint nel, - uint max_hash_size, real tol) -{ - uint i,el,size,hn3,sum; unsigned hn; - unsigned *count; - p->obb = tmalloc(obbox_3,nel); - hash_getbb_3(p,x,n,nel,tol); - size = hash_opt_size_3(p,nel,max_hash_size); - p->offset = tmalloc(uint,size); - hn = p->hash_n; - hn3 = (uint)hn*hn*hn; - count = tcalloc(unsigned,hn3); - for(el=0;elmax=count[0]; - p->offset[0]=sum; - for(i=0;ip->max) p->max=count[i]; - sum+=count[i]; - p->offset[i+1]=sum; - } - for(el=0;eloffset[p->offset[index+1] - count[index]] = el; - --count[index]; - } - } - free(count); -} - -static void hash_free_2(hash_data_2 *p) -{ - free(p->obb); - free(p->offset); -} - -static void hash_free_3(hash_data_3 *p) -{ - free(p->obb); - free(p->offset); -} - -/*-------------------------------------------------------------------------- - Optimization algorithm to find a point within an element - - Given x(r) (as values of x,y,z at all Lobatto nodes) and x_star, - find the r that minimizes || x_star - x(r) ||_2 - - As a minimization problem, the Newton step is - - __ 3 - [ J^T J - >_ d=1 resid_d H_d ] dr = J^t resid - - where resid = x_star - x(r), J = [ dx_i/dr_j ], - 
and H_d = [ d^2 x_d/dr_i dr_j ]. - - This is the appropriate step to take whenever constraints are active, - and the current iterate is on a boundary of the element. When the current - iterate is inside, J is square ( dim r = dim x ), resid will become small, - and the step - - J dr = resid - - may be used instead, still giving quadratic convergence. - - - Names use a _3 suffix for 3-d and _2 for 2-d. - The routines require an initialized lagrange_data array as input: - unsigned d, n[3] = { ... }; - real *z[3] = { tmalloc(real, n[0]), ... }; - for(d=0;d<3;++d) lobatto_nodes(z[d],n[d]); - - lagrange_data ld[3]; - for(d=0;d<3;++d) lagrange_setup(&ld[d],z[d],n[d]); - - Initialization: - opt_data_3 data; - opt_alloc_3(&data, ld); - - Use: - const real *xm[3]; // 3 pointers, each to n[0]*n[1]*n[2] reals - // giving the nodal x, y, or z coordinates - - const real x_star[3] = { ... }; // point to find - real r[3] = { 0,0,0 }; // initial guess with - unsigned c = opt_no_constraints_3; // these constraints active - - real dist = opt_findpt_3(&data,xm,x_star,r,&c); - // minimizer is r with constraints c; 2-norm of resid is dist - - Clean-up: - opt_free_3(&data); - - for(d=0;d<3;++d) lagrange_free(&ld[d]); - for(d=0;d<3;++d) free(z[d]); - - The constraint number works as follows. Let cr be the constraints - on the r variable: - cr = 0 r fixed at -1 - cr = 1 r not fixed - cr = 2 r fixed at 1 - Then the constraint number is (ct*3+cs)*3+cr - - --------------------------------------------------------------------------*/ - -static const unsigned opt_no_constraints_2 = 3+1; -static const unsigned opt_no_constraints_3 = 9+3+1; - -/* how many directions are constrained? */ -static const char opt_constr_num_2[9] = {2,1,2, 1,0,1, 2,1,2}; -static const char opt_constr_num_3[27] = { - 3,2,3, 2,1,2, 3,2,3, - 2,1,2, 1,0,1, 2,1,2, - 3,2,3, 2,1,2, 3,2,3 -}; - -/* which direction is constrained? 
*/ -static const char opt_constr_dir_2[9] = {-1, 1,-1, 0,-1, 0, -1, 1,-1}; -static const char opt_constr_dir_3[27] = { - -1,-1,-1, -1, 2,-1, -1,-1,-1, - -1, 1,-1, 0,-1, 0, -1, 1,-1, - -1,-1,-1, -1, 2,-1, -1,-1,-1 -}; - -/* which direction is not constrained? */ -static const char opt_constr_not[27] = { - -1, 0,-1, 1,-1, 1, -1, 0,-1, - 2,-1, 2, -1,-1,-1, 2,-1, 2, - -1, 0,-1, 1,-1, 1, -1, 0,-1 -}; - -static const char opt_constr_wide[27] = { - 0x00,0x01,0x02, 0x04,0x05,0x06, 0x08,0x09,0x0a, - 0x10,0x11,0x12, 0x14,0x15,0x16, 0x18,0x19,0x1a, - 0x20,0x21,0x22, 0x24,0x25,0x26, 0x28,0x29,0x2a -}; - -static const unsigned opt_other1_3[3] = {1,0,0}, - opt_other2_3[3] = {2,2,1}; - -static unsigned opt_constr(unsigned constraints, unsigned d) -{ - return (opt_constr_wide[constraints]>>(d*2))&3; -} - -static void opt_constr_unpack_2(unsigned constraints, unsigned *c) -{ - const char cw = opt_constr_wide[constraints]; - c[0] = cw & 3; - c[1] = cw >> 2; -} - -static void opt_constr_unpack_3(unsigned constraints, unsigned *c) -{ - const char cw = opt_constr_wide[constraints]; - c[0] = cw & 3; - c[1] = (cw >> 2) & 3; - c[2] = cw >> 4; -} - -static unsigned opt_constr_pack_2(const unsigned *c) -{ - return c[1]*3+c[0]; -} - -static unsigned opt_constr_pack_3(const unsigned *c) -{ - return (c[2]*3+c[1])*3+c[0]; -} - -/*-------------------------------------------------------------------------- - - 3 - D - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned constraints; - unsigned dn, d1, d2; - real *x[3], *fdn[3]; -} opt_face_data_3; - -typedef struct { - unsigned constraints; - unsigned de, d1, d2; - real *x[3], *fd1[3], *fd2[3]; -} opt_edge_data_3; - -typedef struct { - unsigned constraints; - real x[3], jac[9]; -} opt_point_data_3; - -typedef struct { - lagrange_data *ld; - unsigned size[4]; - const real *elx[3]; - opt_face_data_3 fd; - opt_edge_data_3 ed; - opt_point_data_3 pd; - real *work; - real x[3], jac[9]; -} 
opt_data_3; - -static void opt_alloc_3(opt_data_3 *p, lagrange_data *ld) -{ - const unsigned nr = ld[0].n, ns = ld[1].n, nt = ld[2].n, - nf = umax_3(nr*ns,nr*nt,ns*nt), - ne = umax_3(nr,ns,nt), - nw = 2*ns*nt + 3*ns; - p->size[0] = 1; - p->size[1] = nr; - p->size[2] = nr*ns; - p->size[3] = p->size[2]*nt; - p->ld = ld; - p->work = tmalloc(real, 6*nf + 9*ne + nw); - p->fd.x[0] = p->work + nw; - p->fd.x[1] = p->fd.x[0] + nf; - p->fd.x[2] = p->fd.x[1] + nf; - p->fd.fdn[0] = p->fd.x[2] + nf; - p->fd.fdn[1] = p->fd.fdn[0] + nf; - p->fd.fdn[2] = p->fd.fdn[1] + nf; - p->ed.x[0] = p->fd.fdn[2] + nf; - p->ed.x[1] = p->ed.x[0] + ne; - p->ed.x[2] = p->ed.x[1] + ne; - p->ed.fd1[0] = p->ed.x[2] + ne; - p->ed.fd1[1] = p->ed.fd1[0] + ne; - p->ed.fd1[2] = p->ed.fd1[1] + ne; - p->ed.fd2[0] = p->ed.fd1[2] + ne; - p->ed.fd2[1] = p->ed.fd2[0] + ne; - p->ed.fd2[2] = p->ed.fd2[1] + ne; -} - -static void opt_free_3(opt_data_3 *p) -{ - free(p->work); -} - -static void opt_vol_set_3(opt_data_3 *p, const real r[3]) -{ - lagrange_1(&p->ld[0],r[0]); - lagrange_1(&p->ld[1],r[1]); - lagrange_1(&p->ld[2],r[2]); -} - -/* work holds 2*ns*nt + 3*ns reals */ -static void opt_vol_intp_3(opt_data_3 *p) -{ - unsigned d; - const lagrange_data *ld = p->ld; - - for(d=0;d<3;++d) - p->x[d] = tensor_ig3(ld[0].J,ld[0].D,ld[0].n, - ld[1].J,ld[1].D,ld[1].n, - ld[2].J,ld[2].D,ld[2].n, - p->elx[d], &p->jac[d*3], p->work); -} - -static void opt_vol_set_intp_3(opt_data_3 *p, const real r[3]) -{ - opt_vol_set_3(p,r); - opt_vol_intp_3(p); -} - -static void opt_face_proj_3(opt_data_3 *p) -{ - unsigned d, off=0; - const unsigned dn = p->fd.dn, d1 = p->fd.d1, d2 = p->fd.d2, - so = p->size[d2]-p->size[d1+1], - s1 = p->size[d1], sn = p->size[dn], - n1 = p->ld[d1].n, n2 = p->ld[d2].n, nn = p->ld[dn].n; - const real *D = p->ld[dn].D_z0; - if(opt_constr(p->fd.constraints,dn)==2) - off = p->size[dn+1]-p->size[dn], - D = p->ld[dn].D_zn; - for(d=0;d<3;++d) { - unsigned i,j,k,index=0; - const real *in = p->elx[d]+off; - 
for(j=n2;j;--j,in+=so) - for(i=n1;i;--i,++index,in+=s1) { - const real *ind = in-off; - real *fdn = &p->fd.fdn[d][index]; - p->fd.x[d][index] = *in; - *fdn = 0; - for(k=0;kfd.constraints!=constr) { - p->fd.constraints=constr; - p->fd.dn = opt_constr_dir_3[constr]; - p->fd.d1 = opt_other1_3[p->fd.dn]; - p->fd.d2 = opt_other2_3[p->fd.dn]; - opt_face_proj_3(p); - } - lagrange_1(&p->ld[p->fd.d1],r[p->fd.d1]); - lagrange_1(&p->ld[p->fd.d2],r[p->fd.d2]); -} - -/* work holds 2*ld[d2].n reals */ -static void opt_face_intp_3(opt_data_3 *p) -{ - unsigned d; - const unsigned dn = p->fd.dn, d1 = p->fd.d1, d2 = p->fd.d2, - n1 = p->ld[d1].n, n2 = p->ld[d2].n; - const real *J1 = p->ld[d1].J, *J2 = p->ld[d2].J, - *D1 = p->ld[d1].D, *D2 = p->ld[d2].D; - - for(d=0;d<3;++d) { - real g[2]; - p->x[d] = tensor_ig2(J1,D1,n1, J2,D2,n2, p->fd.x[d], &g[0], p->work); - p->jac[d*3+d1] = g[0]; - p->jac[d*3+d2] = g[1]; - p->jac[d*3+dn] = tensor_i2(J1,n1, J2,n2, p->fd.fdn[d], p->work); - } -} - -static void opt_face_set_intp_3(opt_data_3 *p, const real r[3], unsigned constr) -{ - opt_face_set_3(p,r,constr); - opt_face_intp_3(p); -} - -static void opt_face_hess_3(opt_data_3 *p, real hess[9]) -{ - unsigned d; - const unsigned d1 = p->fd.d1, d2 = p->fd.d2, - n1 = p->ld[d1].n, n2 = p->ld[d2].n; - const real *J1 = p->ld[d1].J , *J2 = p->ld[d2].J, - *D1 = p->ld[d1].D , *D2 = p->ld[d2].D, - *S1 = p->ld[d1].D2, *S2 = p->ld[d2].D2; - - lagrange_2u(&p->ld[d1]); - lagrange_2u(&p->ld[d2]); - - for(d=0;d<3;++d) { - (void) tensor_ig2(J1,S1,n1, J2,S2,n2, p->fd.x[d], hess+d*3, p->work); - hess[d*3+0] = tensor_i2(S1,n1, J2,n2, p->fd.x[d], p->work); - hess[d*3+1] = tensor_i2(J1,n1, S2,n2, p->fd.x[d], p->work); - hess[d*3+2] = tensor_i2(D1,n1, D2,n2, p->fd.x[d], p->work); - } -} - -static void opt_edge_proj_3(opt_data_3 *p) -{ - unsigned d, off, off1=0, off2=0; - const unsigned de=p->ed.de, d1=p->ed.d1, d2=p->ed.d2, - se=p->size[de], s1=p->size[d1], s2=p->size[d2], - ne=p->ld[de].n, n1=p->ld[d1].n, n2=p->ld[d2].n; 
- const real *fD1, *fD2; - if(opt_constr(p->ed.constraints,d1)==0) - fD1=p->ld[d1].D_z0; - else - fD1=p->ld[d1].D_zn, off1 = p->size[d1+1]-p->size[d1]; - if(opt_constr(p->ed.constraints,d2)==0) - fD2=p->ld[d2].D_z0; - else - fD2=p->ld[d2].D_zn, off2 = p->size[d2+1]-p->size[d2]; - off = off1+off2; - for(d=0;d<3;++d) { - unsigned i,j; - const real *in = p->elx[d]+off; - for(i=0;ied.fd1[d][i], *fd2 = &p->ed.fd2[d][i]; - p->ed.x[d][i] = *in; - *fd1 = *fd2 = 0; - for(j=0;jed.constraints!=constr) { - p->ed.constraints=constr; - p->ed.de = opt_constr_not[constr]; - p->ed.d1 = opt_other1_3[p->ed.de]; - p->ed.d2 = opt_other2_3[p->ed.de]; - opt_edge_proj_3(p); - } - lagrange_1(&p->ld[p->ed.de],r[p->ed.de]); -} - -static void opt_edge_intp_3(opt_data_3 *p) -{ - unsigned d; - const unsigned de = p->ed.de, d1 = p->ed.d1, d2 = p->ed.d2, - n = p->ld[de].n; - const real *J = p->ld[de].J, *D = p->ld[de].D; - - for(d=0;d<3;++d) { - p->x[d] = tensor_ig1(J,D,n, p->ed.x[d], &p->jac[d*3+de]); - p->jac[d*3+d1] = tensor_i1(J,n, p->ed.fd1[d]); - p->jac[d*3+d2] = tensor_i1(J,n, p->ed.fd2[d]); - } -} - -static void opt_edge_set_intp_3(opt_data_3 *p, const real r[3], unsigned constr) -{ - opt_edge_set_3(p,r,constr); - opt_edge_intp_3(p); -} - -static void opt_edge_hess_3(opt_data_3 *p, real hess[3]) -{ - unsigned d; - const unsigned de = p->ed.de, n = p->ld[de].n; - const real *D2 = p->ld[de].D2; - lagrange_2u(&p->ld[de]); - for(d=0;d<3;++d) hess[d] = tensor_i1(D2,n, p->ed.x[d]); -} - -static void opt_point_proj_3(opt_data_3 *p) -{ - unsigned off[3], offt, d, c[3]; - const real *fD[3]; - opt_constr_unpack_3(p->pd.constraints,c); - for(d=0;d<3;++d) - if(c[d]==0) - fD[d]=p->ld[d].D_z0,off[d]=0; - else - fD[d]=p->ld[d].D_zn,off[d]=p->size[d+1]-p->size[d]; - offt = off[0]+off[1]+off[2]; - for(d=0;d<9;++d) p->pd.jac[d]=0; - for(d=0;d<3;++d) { - unsigned i,j; - p->pd.x[d] = p->elx[d][offt]; - for(i=0;i<3;++i) { - const real *in = p->elx[d]+offt-off[i]; - for(j=0;jld[i].n;++j,in+=p->size[i]) - 
p->pd.jac[d*3+i] += *in * fD[i][j]; - } - } -} - -static void opt_point_set_3(opt_data_3 *p, unsigned constr) -{ - if(p->pd.constraints!=constr) { - p->pd.constraints=constr; - opt_point_proj_3(p); - } -} - -static void opt_point_intp_3(opt_data_3 *p) -{ - memcpy(p->x,p->pd.x,3*sizeof(real)); - memcpy(p->jac,p->pd.jac,9*sizeof(real)); -} - -static void opt_point_set_intp_3(opt_data_3 *p, unsigned constr) -{ - opt_point_set_3(p,constr); - opt_point_intp_3(p); -} - -#define DIAGNOSTICS 0 - -static double opt_findpt_3(opt_data_3 *p, const real *const elx[3], - const real xstar[3], real r[3], unsigned *constr) -{ - real dr[3], resid[3], steep[3]; - - unsigned c=*constr,ac,d,cc[3],step=0; - - p->elx[0]=elx[0], p->elx[1]=elx[1], p->elx[2]=elx[2]; - - p->fd.constraints = opt_no_constraints_3; - p->ed.constraints = opt_no_constraints_3; - p->pd.constraints = opt_no_constraints_3; - -# if DIAGNOSTICS - printf("opt_findpt: xstar = %g, %g, %g\n", xstar[0], xstar[1], xstar[2]); -# endif - - do { - ++step; - if(step==50) fail("%s: opt_findpt_3 did not converge\n",__FILE__); -# if DIAGNOSTICS - printf(" iteration %u\n", step); - printf(" %d constraint(s) active\n", (int)opt_constr_num_3[c]); -# endif - /* update face/edge/point data if necessary, - and evaluate x(r) as well as the jacobian */ - switch(opt_constr_num_3[c]) { - case 0: opt_vol_set_intp_3(p,r); break; - case 1: opt_face_set_intp_3(p,r,c); break; - case 2: opt_edge_set_intp_3(p,r,c); break; - case 3: opt_point_set_intp_3(p,c); break; - } -# if DIAGNOSTICS - printf(" r = %g, %g, %g\n", r[0], r[1], r[2]); - printf(" x = %g, %g, %g\n", p->x[0], p->x[1], p->x[2]); -# endif - /* compute residual */ - for(d=0;d<3;++d) resid[d]=xstar[d]-p->x[d]; -# if DIAGNOSTICS - printf(" resid = %g, %g, %g\n", resid[0], resid[1], resid[2]); - printf(" 2-norm = %g\n", r2norm_3(resid[0],resid[1],resid[2])); -# endif - /* check constraints against steepest descent direction */ - ac = c; - if(opt_constr_num_3[c]) { - 
opt_constr_unpack_3(c,cc); - mat_app_3c(steep,p->jac,resid); /* steepest descent = J^T r */ -# if DIAGNOSTICS - printf(" steepest descent = %g, %g, %g\n", steep[0],steep[1],steep[2]); -# endif - for(d=0;d<3;++d) - if((cc[d]==0 && steep[d]>0) || (cc[d]==2 && steep[d]<0)) cc[d]=1; - ac = opt_constr_pack_3(cc); - } - /* update face/edge/point data if necessary */ - if(ac!=c) { - c=ac; -# if DIAGNOSTICS - printf(" relaxed to %d constraints\n", (int)opt_constr_num_3[c]); -# endif - switch(opt_constr_num_3[c]) { - case 1: opt_face_set_3(p,r,c); break; - case 2: opt_edge_set_3(p,r,c); break; - case 3: opt_point_set_3(p,c); break; - } - } - /* compute Newton step */ - switch(opt_constr_num_3[c]) { - case 0: tinyla_solve_3(dr,p->jac,resid); break; - case 1: { - const unsigned dn = p->fd.dn, d1 = p->fd.d1, d2 = p->fd.d2; - real A[4], H[9]; - const real *J = p->jac; - opt_face_hess_3(p,H); - A[0] = J[d1]*J[d1] + J[3+d1]*J[3+d1] + J[6+d1]*J[6+d1]; - A[1] = J[d2]*J[d2] + J[3+d2]*J[3+d2] + J[6+d2]*J[6+d2]; - A[2] = J[d1]*J[d2] + J[3+d1]*J[3+d2] + J[6+d1]*J[6+d2]; - A[0] -= resid[0]*H[0] + resid[1]*H[3] + resid[2]*H[6]; - A[1] -= resid[0]*H[1] + resid[1]*H[4] + resid[2]*H[7]; - A[2] -= resid[0]*H[2] + resid[1]*H[5] + resid[2]*H[8]; - tinyla_solve_sym_2(&dr[d1],&dr[d2],A,steep[d1],steep[d2]); - dr[dn]=0; - } break; - case 2: { - const unsigned de = p->ed.de, d1 = p->ed.d1, d2 = p->ed.d2; - real fac, H[3]; - const real *J = p->jac+de; - opt_edge_hess_3(p,H); - fac = J[0]*J[0]+J[3]*J[3]+J[6]*J[6] - -(resid[0]*H[0]+resid[1]*H[1]+resid[2]*H[2]); - dr[de] = steep[de] / fac; - dr[d1] = 0, dr[d2] = 0; - } break; - case 3: - dr[0] = dr[1] = dr[2] = 0; - break; - } -# if DIAGNOSTICS - printf(" dr = %g, %g, %g\n", dr[0], dr[1], dr[2]); -# endif - /* project new iteration onto [-1,1]^3 */ - opt_constr_unpack_3(c,cc); - for(d=0;d<3;++d) { - if(cc[d]!=1) continue; - r[d] += dr[d]; - if(r[d] <= -1) - dr[d] -= r[d]+1, r[d] = -1, cc[d]=0; - else if(r[d] >= 1) - dr[d] -= r[d]-1, r[d] = 1, cc[d]=2; 
- } - c = opt_constr_pack_3(cc); - } while(r1norm_3(dr[0],dr[1],dr[2]) > 30*EPS); - *constr = c; -# if 0 - printf("opt_findpt_3 converged in %u iterations\n", step); -# endif - return r2norm_3(resid[0],resid[1],resid[2]); -} - -#undef DIAGNOSTICS - -/*-------------------------------------------------------------------------- - - 2 - D - - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned constraints; - unsigned de, d1; - real *x[2], *fd1[2]; -} opt_edge_data_2; - -typedef struct { - unsigned constraints; - real x[2], jac[4]; -} opt_point_data_2; - -typedef struct { - lagrange_data *ld; - unsigned size[3]; - const real *elx[2]; - opt_edge_data_2 ed; - opt_point_data_2 pd; - real *work; - real x[2], jac[4]; -} opt_data_2; - -static void opt_alloc_2(opt_data_2 *p, lagrange_data *ld) -{ - const unsigned nr = ld[0].n, ns = ld[1].n, - ne = umax_2(nr,ns), - nw = 2*ns; - p->size[0] = 1; - p->size[1] = nr; - p->size[2] = nr*ns; - p->ld = ld; - p->work = tmalloc(real, 4*ne + nw); - p->ed.x[0] = p->work + nw; - p->ed.x[1] = p->ed.x[0] + ne; - p->ed.fd1[0] = p->ed.x[1] + ne; - p->ed.fd1[1] = p->ed.fd1[0] + ne; -} - -static void opt_free_2(opt_data_2 *p) -{ - free(p->work); -} - -static void opt_area_set_2(opt_data_2 *p, const real r[2]) -{ - lagrange_1(&p->ld[0],r[0]); - lagrange_1(&p->ld[1],r[1]); -} - -/* work holds 2*ns reals */ -static void opt_area_intp_2(opt_data_2 *p) -{ - unsigned d; - const lagrange_data *ld = p->ld; - - for(d=0;d<2;++d) - p->x[d] = tensor_ig2(ld[0].J,ld[0].D,ld[0].n, - ld[1].J,ld[1].D,ld[1].n, - p->elx[d], &p->jac[d*2], p->work); -} - -static void opt_area_set_intp_2(opt_data_2 *p, const real r[2]) -{ - opt_area_set_2(p,r); - opt_area_intp_2(p); -} - -static void opt_edge_proj_2(opt_data_2 *p) -{ - unsigned d, off=0; - const unsigned de = p->ed.de, d1 = p->ed.d1, - se=p->size[de], s1=p->size[d1], - ne=p->ld[de].n, n1=p->ld[d1].n; - const real *fD1; - if(opt_constr(p->ed.constraints,d1)==0) - 
fD1=p->ld[d1].D_z0; - else - fD1=p->ld[d1].D_zn, off = p->size[d1+1]-p->size[d1]; - for(d=0;d<2;++d) { - unsigned i,j; - const real *in = p->elx[d]+off; - for(i=0;ied.fd1[d][i]; - p->ed.x[d][i] = *in; - *fd1 = 0; - for(j=0;jed.constraints!=constr) { - p->ed.constraints=constr; - p->ed.de = opt_constr_not[constr]; - p->ed.d1 = 1 - p->ed.de; - opt_edge_proj_2(p); - } - lagrange_1(&p->ld[p->ed.de],r[p->ed.de]); -} - -static void opt_edge_intp_2(opt_data_2 *p) -{ - unsigned d; - const unsigned de = p->ed.de, d1 = p->ed.d1, n = p->ld[de].n; - const real *J = p->ld[de].J, *D = p->ld[de].D; - for(d=0;d<2;++d) { - p->x[d] = tensor_ig1(J,D,n, p->ed.x[d], &p->jac[d*2+de]); - p->jac[d*2+d1] = tensor_i1(J,n, p->ed.fd1[d]); - } -} - -static void opt_edge_set_intp_2(opt_data_2 *p, const real r[2], unsigned constr) -{ - opt_edge_set_2(p,r,constr); - opt_edge_intp_2(p); -} - -static void opt_edge_hess_2(opt_data_2 *p, real hess[2]) -{ - unsigned d; - const unsigned de = p->ed.de, n = p->ld[de].n; - const real *D2 = p->ld[de].D2; - lagrange_2u(&p->ld[de]); - for(d=0;d<2;++d) hess[d] = tensor_i1(D2,n, p->ed.x[d]); -} - -static void opt_point_proj_2(opt_data_2 *p) -{ - unsigned off[2], offt, d, c[2]; - const real *fD[2]; - opt_constr_unpack_2(p->pd.constraints,c); - for(d=0;d<2;++d) - if(c[d]==0) - fD[d]=p->ld[d].D_z0,off[d]=0; - else - fD[d]=p->ld[d].D_zn,off[d]=p->size[d+1]-p->size[d]; - offt = off[0]+off[1]; - for(d=0;d<4;++d) p->pd.jac[d]=0; - for(d=0;d<2;++d) { - unsigned i,j; - p->pd.x[d] = p->elx[d][offt]; - for(i=0;i<2;++i) { - const real *in = p->elx[d]+offt-off[i]; - for(j=0;jld[i].n;++j,in+=p->size[i]) - p->pd.jac[d*2+i] += *in * fD[i][j]; - } - } -} - -static void opt_point_set_2(opt_data_2 *p, unsigned constr) -{ - if(p->pd.constraints!=constr) { - p->pd.constraints=constr; - opt_point_proj_2(p); - } -} - -static void opt_point_intp_2(opt_data_2 *p) -{ - memcpy(p->x,p->pd.x,2*sizeof(real)); - memcpy(p->jac,p->pd.jac,4*sizeof(real)); -} - -static void 
opt_point_set_intp_2(opt_data_2 *p, unsigned constr) -{ - opt_point_set_2(p,constr); - opt_point_intp_2(p); -} - -#define DIAGNOSTICS 0 - -static double opt_findpt_2(opt_data_2 *p, const real *const elx[2], - const real xstar[2], real r[2], unsigned *constr) -{ - real dr[2], resid[2], steep[2]; - - unsigned c=*constr,ac,d,cc[2],step=0; - - p->elx[0]=elx[0], p->elx[1]=elx[1]; - - p->ed.constraints = opt_no_constraints_2; - p->pd.constraints = opt_no_constraints_2; - -# if DIAGNOSTICS - printf("opt_findpt: xstar = %g, %g\n", xstar[0], xstar[1]); -# endif - - do { - ++step; - if(step==150) fail("%s: opt_findpt_2 did not converge\n",__FILE__); -# if DIAGNOSTICS - printf(" iteration %u\n", step); - printf(" %d constraint(s) active\n", (int)opt_constr_num_2[c]); -# endif - /* update face/edge/point data if necessary, - and evaluate x(r) as well as the jacobian */ - switch(opt_constr_num_2[c]) { - case 0: opt_area_set_intp_2(p,r); break; - case 1: opt_edge_set_intp_2(p,r,c); break; - case 2: opt_point_set_intp_2(p,c); break; - } -# if DIAGNOSTICS - printf(" r = %g, %g\n", r[0], r[1]); - printf(" x = %g, %g\n", p->x[0], p->x[1]); -# endif - /* compute residual */ - for(d=0;d<2;++d) resid[d]=xstar[d]-p->x[d]; -# if DIAGNOSTICS - printf(" resid = %g, %g\n", resid[0], resid[1]); - printf(" 2-norm = %g\n", r2norm_2(resid[0],resid[1])); -# endif - /* check constraints against steepest descent direction */ - ac = c; - if(opt_constr_num_2[c]) { - opt_constr_unpack_2(c,cc); - mat_app_2c(steep,p->jac,resid); /* steepest descent = J^T r */ -# if DIAGNOSTICS - printf(" steepest descent = %g, %g\n", steep[0], steep[1]); -# endif - for(d=0;d<2;++d) - if((cc[d]==0 && steep[d]>0) || (cc[d]==2 && steep[d]<0)) cc[d]=1; - ac = opt_constr_pack_2(cc); - } - /* update face/edge/point data if necessary */ - if(ac!=c) { - c=ac; -# if DIAGNOSTICS - printf(" relaxed to %d constraints\n", (int)opt_constr_num_2[c]); -# endif - switch(opt_constr_num_2[c]) { - case 1: opt_edge_set_2(p,r,c); break; - 
case 2: opt_point_set_2(p,c); break; - } - } - /* compute Newton step */ - switch(opt_constr_num_2[c]) { - case 0: tinyla_solve_2(dr,p->jac,resid); break; - case 1: { - const unsigned de = p->ed.de, d1 = p->ed.d1; - real fac, H[2]; - const real *J = p->jac+de; - opt_edge_hess_2(p,H); - fac = J[0]*J[0]+J[2]*J[2]-(resid[0]*H[0]+resid[1]*H[1]); - dr[de] = steep[de] / fac; - dr[d1] = 0; - } break; - case 2: - dr[0] = dr[1] = 0; - break; - } -# if DIAGNOSTICS - printf(" dr = %g, %g\n", dr[0], dr[1]); -# endif - /* project new iteration onto [-1,1]^2 */ - opt_constr_unpack_2(c,cc); - for(d=0;d<2;++d) { - if(cc[d]!=1) continue; - r[d] += dr[d]; - if(r[d] <= -1) - dr[d] -= r[d]+1, r[d] = -1, cc[d]=0; - else if(r[d] >= 1) - dr[d] -= r[d]-1, r[d] = 1, cc[d]=2; - } - c = opt_constr_pack_2(cc); - } while(r1norm_2(dr[0],dr[1]) > 30*EPS); - *constr = c; - return r2norm_2(resid[0],resid[1]); -} - -#undef DIAGNOSTICS - -/*-------------------------------------------------------------------------- - Point Finding (interface/top-level) - - Initializing the data: - unsigned nel; // number of elements - const unsigned n[3]; // number of nodes in r, s, t directions - const real *xm[3]; // n[0]*n[1]*n[2]*nel x,y,z coordinates - real tol = 0.01; // how far point is allowed to be outside element - // relative to element size - unsigned max_size = n[0]*n[1]*n[2]*nel; // maximum size of hash table - - findpt_data_3 *data = findpt_setup_3(xm,n,nel,max_size,tol); - - Using the data: - real x[3] = { ... }; // point to find - int el; // element number - real r[3]; // parametric coordinates - int guess = 0; // do we use (el,r,s,t) as an initial guess? 
- int code; // 0 : normal, -1 : outside all elements, - // 1 : border, or outside but within tolerance - real dist; // distance in xyz space from returned (el,r,s,t) to given - // (x,y,z) - - code = findpt_3(data, x, guess, &el, r, &dist); - - When done: - findpt_free_3(&data); - - --------------------------------------------------------------------------*/ - -typedef struct { - uint el; - real r[3]; - real dist; -} findpt_listel; - -/* heap sort on A[0:n-1] with key A[i]->dist - precondition: n!=0 */ -static void findpt_list_sort(findpt_listel **A, unsigned n) -{ - unsigned i; - --A; /* make A have a base index of 1 */ - /* build heap */ - for(i=2;i<=n;++i) { - findpt_listel *item = A[i]; - unsigned hole = i, parent = hole>>1; - if(A[parent]->dist >= item->dist) continue; - do { - A[hole] = A[parent]; - hole = parent; - parent>>=1; - } while(parent && A[parent]->dist < item->dist); - A[hole] = item; - } - /* extract */ - for(i=n-1;i;--i) { - findpt_listel *item = A[i+1]; - unsigned hole = 1; - A[i+1] = A[1]; - for(;;) { - unsigned ch = hole<<1, r = ch+1; - if(r<=i && A[ch]->dist < A[r]->dist) ch=r; - if(ch>i || item->dist >= A[ch]->dist) break; - A[hole]=A[ch]; - hole=ch; - } - A[hole] = item; - } -} - -typedef struct { - const real *xw[2]; /* geometry data */ - real *z[2]; /* lobatto nodes */ - lagrange_data ld[2]; /* interpolation, derivative weights & data */ - unsigned nptel; /* nodes per element */ - hash_data_2 *hash; /* geometric hashing data */ - findpt_listel *list, **sorted, **end; /* pre-allocated list of elements to - check (found by hashing), and - pre-allocated list of pointers into - the first list for sorting */ - opt_data_2 *od; /* data for the optimization algorithm */ - real *od_work; -} findpt_data_2; - -typedef struct { - const real *xw[3]; /* geometry data */ - real *z[3]; /* lobatto nodes */ - lagrange_data ld[3]; /* interpolation, derivative weights & data */ - unsigned nptel; /* nodes per element */ - hash_data_3 *hash; /* geometric 
hashing data */ - findpt_listel *list, **sorted, **end; /* pre-allocated list of elements to - check (found by hashing), and - pre-allocated list of pointers into - the first list for sorting */ - opt_data_3 *od; /* data for the optimization algorithm */ - real *od_work; -} findpt_data_3; - -findpt_data_2 *findpt_setup_2( - const real *const xw[2], const unsigned n[2], uint nel, - uint max_hash_size, real bbox_tol) -{ - unsigned d; - findpt_data_2 *p = tmalloc(findpt_data_2,1); - - p->hash = tmalloc(hash_data_2,1); - p->od = tmalloc(opt_data_2,1); - - for(d=0;d<2;++d) p->xw[d]=xw[d]; - p->nptel = n[0]*n[1]; - - hash_build_2(p->hash,xw,n,nel,max_hash_size,bbox_tol); - - for(d=0;d<2;++d) { - p->z[d] = tmalloc(real,n[d]); - lobatto_nodes(p->z[d],n[d]); - lagrange_setup(&p->ld[d],p->z[d],n[d]); - } - - p->list = tmalloc(findpt_listel , p->hash->max); - p->sorted = tmalloc(findpt_listel*, p->hash->max); - - opt_alloc_2(p->od,p->ld); - p->od_work = p->od->work; - - return p; -} - -findpt_data_3 *findpt_setup_3( - const real *const xw[3], const unsigned n[3], uint nel, - uint max_hash_size, real bbox_tol) -{ - unsigned d; - findpt_data_3 *p = tmalloc(findpt_data_3,1); - - p->hash = tmalloc(hash_data_3,1); - p->od = tmalloc(opt_data_3,1); - - for(d=0;d<3;++d) p->xw[d]=xw[d]; - p->nptel = n[0]*n[1]*n[2]; - - hash_build_3(p->hash,xw,n,nel,max_hash_size,bbox_tol); - - for(d=0;d<3;++d) { - p->z[d] = tmalloc(real,n[d]); - lobatto_nodes(p->z[d],n[d]); - lagrange_setup(&p->ld[d],p->z[d],n[d]); - } - - p->list = tmalloc(findpt_listel , p->hash->max); - p->sorted = tmalloc(findpt_listel*, p->hash->max); - - opt_alloc_3(p->od,p->ld); - p->od_work = p->od->work; - - return p; -} - -void findpt_free_2(findpt_data_2 *p) -{ - unsigned d; - opt_free_2(p->od); free(p->od); - hash_free_2(p->hash); free(p->hash); - free(p->list); - free(p->sorted); - for(d=0;d<2;++d) free(p->z[d]); - free(p); -} - -void findpt_free_3(findpt_data_3 *p) -{ - unsigned d; - opt_free_3(p->od); free(p->od); - 
hash_free_3(p->hash); free(p->hash); - free(p->list); - free(p->sorted); - for(d=0;d<3;++d) free(p->z[d]); - free(p); -} - -const real *findpt_allbnd_2(const findpt_data_2 *p) -{ - return p->hash->bnd; -} - -const real *findpt_allbnd_3(const findpt_data_3 *p) -{ - return p->hash->bnd; -} - -static void findpt_hash_2(findpt_data_2 *p, const real x[2]) -{ - findpt_listel *list = p->list, **sorted = p->sorted; - const uint hi = hash_index_2(p->hash, x); - const uint *offset = p->hash->offset; - uint i; const uint b = offset[hi], e = offset[hi+1]; - for(i=b;i!=e;++i) { - const uint el = offset[i]; - real *r = &list->r[0]; - const obbox_2 *obb = &p->hash->obb[el]; - if(obbox_axis_test_2(obb,x)) continue; - if(obbox_test_2(obb,x,r)) continue; - list->el = el; - list->dist = r1norm_2(r[0],r[1]); - *sorted++ = list++; - } - p->end = sorted; - if(p->end!=p->sorted) - findpt_list_sort(p->sorted,p->end - p->sorted); -} - -static void findpt_hash_3(findpt_data_3 *p, const real x[3]) -{ - findpt_listel *list = p->list, **sorted = p->sorted; - const uint hi = hash_index_3(p->hash, x); - const uint *offset = p->hash->offset; - uint i; const uint b = offset[hi], e = offset[hi+1]; - for(i=b;i!=e;++i) { - const uint el = offset[i]; - real *r = &list->r[0]; - const obbox_3 *obb = &p->hash->obb[el]; - if(obbox_axis_test_3(obb,x)) continue; - if(obbox_test_3(obb,x,r)) continue; - list->el = el; - list->dist = r1norm_3(r[0],r[1],r[2]); - *sorted++ = list++; - } - p->end = sorted; - if(p->end!=p->sorted) - findpt_list_sort(p->sorted,p->end - p->sorted); -} - -static int findpt_guess_2(findpt_data_2 *p, const real x[2], - uint el, real r[2], real *dist) -{ - const uint index = p->nptel*el; - const real *elx[2] = {p->xw[0]+index,p->xw[1]+index}; - real g[2]; - unsigned c = opt_no_constraints_2; - const obbox_2 *obb = &p->hash->obb[el]; - if(obbox_axis_test_2(obb,x) || obbox_test_2(obb,x,g)) return 0; - *dist = opt_findpt_2(p->od,elx,x,r,&c); - return c==opt_no_constraints_2; -} - -static 
int findpt_guess_3(findpt_data_3 *p, const real x[3], - uint el, real r[3], real *dist) -{ - const uint index = p->nptel*el; - const real *elx[3] = {p->xw[0]+index,p->xw[1]+index,p->xw[2]+index}; - real g[3]; - unsigned c = opt_no_constraints_3; - const obbox_3 *obb = &p->hash->obb[el]; - if(obbox_axis_test_3(obb,x) || obbox_test_3(obb,x,g)) return 0; - *dist = opt_findpt_3(p->od,elx,x,r,&c); - return c==opt_no_constraints_3; -} - -#define DIAGNOSTICS 0 - -static int findpt_pass_2(findpt_data_2 *p, const real x[2], - uint *el, real r[2], real *dist_min) -{ - findpt_listel **qq = p->sorted; - const real *bnd; - do { - findpt_listel *q = *qq; - const uint index = p->nptel*q->el; - const real *elx[2] = {p->xw[0]+index,p->xw[1]+index}; - unsigned c = opt_no_constraints_2; - const real dist = opt_findpt_2(p->od,elx,x,q->r,&c); - if(qq==p->sorted || dist<*dist_min || c==opt_no_constraints_2) { - *dist_min = dist; - *el = q->el; - memcpy(r, q->r, 2*sizeof(real)); - if(c==opt_no_constraints_2) return 0; - } - } while(++qq != p->end); - bnd = p->hash->obb[*el].axis_bnd; - return *dist_min>r2norm_2(bnd[1]-bnd[0],bnd[3]-bnd[2]) ? -1 : 1; -} - -static int findpt_pass_3(findpt_data_3 *p, const real x[3], - uint *el, real r[3], real *dist_min) -{ - findpt_listel **qq = p->sorted; - const real *bnd; - do { - findpt_listel *q = *qq; - const uint index = p->nptel*q->el; - const real *elx[3] = {p->xw[0]+index,p->xw[1]+index,p->xw[2]+index}; - unsigned c = opt_no_constraints_3; - const real dist = opt_findpt_3(p->od,elx,x,q->r,&c); - if(qq==p->sorted || dist<*dist_min || c==opt_no_constraints_3) { - *dist_min = dist; - *el = q->el; - memcpy(r, q->r, 3*sizeof(real)); - if(c==opt_no_constraints_3) { -# if DIAGNOSTICS - printf("point found in element #%d\n", qq-p->sorted); -# endif - return 0; - } - } - } while(++qq != p->end); - bnd = p->hash->obb[*el].axis_bnd; - return *dist_min>r2norm_3(bnd[1]-bnd[0],bnd[3]-bnd[2],bnd[5]-bnd[4]) ? 
-1 : 1; -} - -int findpt_2(findpt_data_2 *p, const real x[2], int guess, - uint *el, real r[2], real *dist) -{ - if(guess && findpt_guess_2(p,x,*el,r,dist)) return 0; - findpt_hash_2(p,x); - if(p->sorted==p->end) return -1; - return findpt_pass_2(p,x,el,r,dist); -} - -int findpt_3(findpt_data_3 *p, const real x[3], int guess, - uint *el, real r[3], real *dist) -{ - if(guess && findpt_guess_3(p,x,*el,r,dist)) return 0; - findpt_hash_3(p,x); -# if DIAGNOSTICS - printf("hashing leaves %d elements to consider\n",p->end-p->sorted); -# endif - if(p->sorted==p->end) return -1; - return findpt_pass_3(p,x,el,r,dist); -} - -static void findpt_weights_2(findpt_data_2 *p, const real r[2]) -{ - lagrange_0(&p->ld[0],r[0]); - lagrange_0(&p->ld[1],r[1]); -} - -static void findpt_weights_3(findpt_data_3 *p, const real r[3]) -{ - lagrange_0(&p->ld[0],r[0]); - lagrange_0(&p->ld[1],r[1]); - lagrange_0(&p->ld[2],r[2]); -} - -static double findpt_eval_2(findpt_data_2 *p, const real *u) -{ - return tensor_i2(p->ld[0].J,p->ld[0].n, - p->ld[1].J,p->ld[1].n, - u, p->od_work); -} - -static double findpt_eval_3(findpt_data_3 *p, const real *u) -{ - return tensor_i3(p->ld[0].J,p->ld[0].n, - p->ld[1].J,p->ld[1].n, - p->ld[2].J,p->ld[2].n, - u, p->od_work); -} - diff --git a/3rdParty/gslib.github/src/findpts2.c b/3rdParty/gslib.github/src/findpts2.c deleted file mode 100644 index 125a33fac..000000000 --- a/3rdParty/gslib.github/src/findpts2.c +++ /dev/null @@ -1,196 +0,0 @@ -/*------------------------------------------------------------------------------ - - FORTRAN interface for pfindpt - - integer ch - call crystal_new(ch) - - integer h - integer ndim ! 
= 2 or 3 - integer nr, ns, nt, nel - real xm1(nr,ns,nt,nel), ym1(nr,ns,nt,nel), zm1(nr,ns,nt,nel) - real tolerance - - call findpts_new(h,ch,ndim,xm1,ym1,zm1,nr,ns,nt,nel,tolerance) - - The returned handle will use the given crystal router handle for - communication, and will be able to locate lists of points that are - within the given mesh to the specified tolerance relative to element size - (e.g. tolerance = 0.1 or 1.0e-10) - - The list of points must have the format: - - integer mi ! >= 3 - integer mr ! >= 1 + 2*ndim - integer n, max - integer vi(mi,max) - real vr(mr,max) - - For point j, j in [1 ... n], n <= max, - vi(1,j) = processor number (0 to P-1) - vi(2,j) = element number (1 to vi(1,j)'s nel) - vi(3,j) = code (-1, 0, 1; explained below) - vr(1,j) = distance (from located point to given point) - vr(2,j) = x (input) - vr(3,j) = y (input) - vr(4,j) = z (input; only when ndim=3) - vr(ndim+2,j) = r (output) - vr(ndim+3,j) = s (output) - vr(ndim+4,j) = t (output; only when ndim=3) - - To locate points: - - call findpts(h, n, vi, mi, vr, mr, guess) - - - On input, only the xyz fields are used; the rest are set as output. - The exception is if guess is non-zero, then element number and parametric - coords will be used as an initial guess. - - The code is set as follows: - 0 : normal - -1 : point not within mesh (to within given tolerance) - 1 : point either exactly on element boundary, or outside mesh - (but within given tolerance); in this case the returned distance - can be used to test if the point is really outside the mesh - - To transfer points: - - call findpts_transfer(h,n,max,vi,mi,vr,mr) - - - This is just a small wrapper over crystal_transfer - - The error condition (more incoming points than max) is indicated by - n .eq. max + 1 on return; - so, to ignore errors one must use: - - call findpts_transfer(h,n,max,vi,mi,vr,mr) - if(n.eq. 
max+1) n=max - - To evaluate scalar fields for point j (must be local): - - real scalar1(nr,ns,nt,nel), scalar2(nr,ns,nt,nel) - real val1, val2 - real r, s, t - integer el - - r = vr(ndim+2,j) - s = vr(ndim+3,j) - if(ndim.eq.3) t = vr(ndim+4,j) - el = vi(1,j) - - call findpts_weights(h, r,s,t) - call findpts_eval(h, val1, scalar1(1,1,1,el)) ! sets val1 - call findpts_eval(h, val2, scalar2(1,1,1,el)) ! sets val2 - ... - - ----------------------------------------------------------------------------*/ - -#include -#include -#include -#ifdef MPI -# include -#endif - -#include "fname.h" -#include "errmem.h" -#include "types.h" -#ifdef MPI -# include "crystal.h" -#endif -#include "tuple_list.h" -#include "pfindpt.h" - -#define findpts_new FORTRAN_NAME(findpts_new,FINDPTS_NEW) -#define findpts_done FORTRAN_NAME(findpts_done,FINDPTS_DONE) -#define findpts_transfer FORTRAN_NAME(findpts_transfer,FINDPTS_TRANSFER) -#define findpts FORTRAN_NAME(findpts,FINDPTS) -#define findpts_weights FORTRAN_NAME(findpts_weights,FINDPTS_WEIGHTS) -#define findpts_eval FORTRAN_NAME(findpts_eval,FINDPTS_EVAL) -#define ftuple_list_sort FORTRAN_NAME(ftuple_list_sort,FTUPLE_LIST_SORT) - - -static pfindpt_data **handle=0; -static unsigned n=0, max=0; - -#ifdef MPI -crystal_data *fcrystal_handle(sint h); -#endif - -void findpts_new(sint *h, const sint *ch, const sint *ndim, - const real xm1[], const real ym1[], const real zm1[], - const sint *nr, const sint *ns, const sint *nt, - const sint *nel, const real *tol) -{ - const real *xw[3] = {xm1,ym1,zm1}; - unsigned nm[3] = {*nr,*ns,*nt}; -#ifdef MPI - crystal_data *crystal = fcrystal_handle(*ch); -#else - void *crystal = 0; -#endif - if(n==max) max+=max/2+1,handle=trealloc(pfindpt_data*,handle,max); - if(*ndim==2) nm[2]=1; - handle[n] = pfindpt_setup(*ndim, xw, nm, *nel, *nel*nm[0]*nm[1]*nm[2], *tol, - crystal); - *h=n++; -} - -static pfindpt_data *findpts_handle(sint h) -{ - if((unsigned)h>=n || handle[h]==0) failwith("invalid findpts handle"); - 
return handle[h]; -} - -void findpts_done(sint *h) -{ - pfindpt_data *p = findpts_handle(*h); - handle[*h]=0; - pfindpt_free(p); -} - -void findpts_transfer(const sint *h, sint *n, const sint *max, sint vi[], - const sint *mi, real vr[], const sint *mr) -{ - tuple_list tl = {*mi,0,*mr, *n,*max, vi,0,vr}; - pfindpt_transfer(findpts_handle(*h),&tl,0); - *n = tl.n; -} - - -void ftuple_list_sort(sint *n, const uint *k, sint vi[], - const sint *mi,real vr[], const sint *mr) -{ - const uint key = *k-1; /* switch to 0-based index */ - buffer buf; - tuple_list tl = {*mi,0,*mr,*n,0,vi,0,vr}; - buffer_init(&buf,65536); /* will be increased automatically if needed */ - tuple_list_sort(&tl,key,&buf); - buffer_free(&buf); -} - - -#define I_EL 1 -void findpts(const sint *h, const sint *n, sint vi[], const sint *mi, - real vr[], const sint *mr, const sint *guess) -{ - uint i; sint *ri; - tuple_list tl = {*mi,0,*mr, *n,*n, vi,0,vr}; - if(*guess) { - ri = &vi[I_EL]; - for(i=0;i -#include -#include -#include -#include -#ifdef MPI -# include -#endif - -#include "errmem.h" -#include "types.h" -#include "minmax.h" -#include "tuple_list.h" -#ifdef MPI -# include "crystal.h" -# include "transfer.h" -#else - typedef void crystal_data; -#endif - -#define OP_ADD 1 -#define OP_MUL 2 -#define OP_MIN 3 -#define OP_MAX 4 -#define OP_BPR 5 - -/*-------------------------------------------------------------------------- - Local Execution Phases - --------------------------------------------------------------------------*/ - -#define DO_SET(a,b) b=a -#define DO_ADD(a,b) a+=b -#define DO_MUL(a,b) a*=b -#define DO_MIN(a,b) if(ba) a=b -#define DO_BPR(a,b) \ - do { uint a_ = a; uint b_ = b; \ - for(;;) { if(a_>=1; else if(b_>=1; else break; } \ - a = a_; \ - } while(0) - - -#define LOOP(op) do { \ - sint i,j; \ - while((i=*cm++) != -1) \ - while((j=*cm++) != -1) \ - op(u[i],u[j]); \ -} while(0) - -static void local_condense(real *u, int op, const sint *cm) -{ - switch(op) { - case OP_ADD: LOOP(DO_ADD); 
break; - case OP_MUL: LOOP(DO_MUL); break; - case OP_MIN: LOOP(DO_MIN); break; - case OP_MAX: LOOP(DO_MAX); break; - case OP_BPR: LOOP(DO_BPR); break; - } -} - -static void local_uncondense(real *u, const sint *cm) -{ - LOOP(DO_SET); -} - -#undef LOOP - -#define LOOP(op) do { \ - sint i,j,k; \ - while((i=*cm++) != -1) { \ - real *pi=u+n*i; \ - while((j=*cm++) != -1) { \ - real *pj=u+n*j; \ - for(k=n;k;--k) { op(*pi,*pj); ++pi, ++pj; } \ - } \ - } \ -} while(0) - -static void local_condense_vec(real *u, uint n, int op, const sint *cm) -{ - switch(op) { - case OP_ADD: LOOP(DO_ADD); break; - case OP_MUL: LOOP(DO_MUL); break; - case OP_MIN: LOOP(DO_MIN); break; - case OP_MAX: LOOP(DO_MAX); break; - case OP_BPR: LOOP(DO_BPR); break; - } -} - -static void local_uncondense_vec(real *u, uint n, const sint *cm) -{ - LOOP(DO_SET); -} - -#undef LOOP - -/*-------------------------------------------------------------------------- - Non-local Execution Phases - --------------------------------------------------------------------------*/ - -#ifdef MPI -typedef struct { - uint id; /* processor id */ - uint np; /* number of processors to communicate with */ - uint *target; /* int target[np]: array of processor ids to comm w/ */ - uint *nshared; /* nshared[i] = number of points shared w/ target[i] */ - uint *sh_ind; /* list of shared point indices */ - MPI_Request *reqs; /* pre-allocated for MPI calls */ - real *buf; /* pre-allocated buffer to receive data */ - uint maxv; /* maximum vector size */ -} nonlocal_info; - -static nonlocal_info *nlinfo_alloc(uint np, uint count, uint maxv) -{ - nonlocal_info *info = tmalloc(nonlocal_info,1); - info->np = np; - info->target = tmalloc(uint,2*np+count); - info->nshared = info->target + np; - info->sh_ind = info->nshared + np; - info->reqs = tmalloc(MPI_Request,2*np); - info->buf = tmalloc(real,2*count*maxv); - info->maxv = maxv; - return info; -} - -static void nlinfo_free(nonlocal_info *info) -{ - free(info->buf); - free(info->reqs); - 
free(info->target); - free(info); -} - -static double nonlocal(real *u, int op, const nonlocal_info *info, - MPI_Comm comm) -{ - uint np = info->np, i; - MPI_Request *reqs = info->reqs; - uint *targ = info->target; - uint *nshared = info->nshared; - uint *sh_ind = info->sh_ind; - uint id = info->id; - real *buf = info->buf; -#ifdef GS_TIMING - double time0, time1; -#endif - for(i=0;ireqs,MPI_STATUSES_IGNORE); -#ifdef GS_TIMING - time1 = MPI_Wtime(); -#endif - buf = info->buf; - sh_ind = info->sh_ind; -# define LOOP(OP) do { \ - for(i=0;inp, i; - MPI_Request *reqs = info->reqs; - uint *targ = info->target; - uint *nshared = info->nshared; - uint *sh_ind = info->sh_ind; - uint id = info->id; - real *buf = info->buf; - uint size = n*sizeof(real); -#ifdef GS_TIMING - double time0, time1; -#endif - for(i=0;ireqs,MPI_STATUSES_IGNORE); -#ifdef GS_TIMING - time1 = MPI_Wtime(); -#endif - buf = info->buf; - sh_ind = info->sh_ind; -# define LOOP(OP) do { \ - for(i=0;inp, i; - MPI_Request *reqs = info->reqs; - uint *targ = info->target; - uint *nshared = info->nshared; - uint *sh_ind = info->sh_ind; - uint id = info->id; - real *buf = info->buf; -#ifdef GS_TIMING - double time0, time1; -#endif - for(i=0;ireqs,MPI_STATUSES_IGNORE); -#ifdef GS_TIMING - time1 = MPI_Wtime(); -#endif - buf = info->buf; - sh_ind = info->sh_ind; -# define LOOP(OP) do { \ - for(i=0;ilocal_cm); -#ifdef MPI - t = nonlocal(u,op,data->nlinfo,data->comm); -#endif - local_uncondense(u,data->local_cm); - return t; -} - -double ogs_op_vec(real *u, uint n, int op, const gs_data *data) -{ - double t = 0; -#ifdef MPI - if(n>data->nlinfo->maxv) - fail("%s: initialized with max vec size = %d," - " but called with vec size = %d\n",__FILE__,data->nlinfo->maxv,n); -#endif - local_condense_vec(u,n,op,data->local_cm); -#ifdef MPI - t = nonlocal_vec(u,n,op,data->nlinfo,data->comm); -#endif - local_uncondense_vec(u,n,data->local_cm); - return t; -} - -double ogs_op_many(real **u, uint n, int op, const gs_data *data) -{ - 
double t = 0; - uint i; -#ifdef MPI - if(n>data->nlinfo->maxv) - fail("%s: initialized with max vec size = %d," - " but called with vec size = %d\n",__FILE__,data->nlinfo->maxv,n); -#endif - for(i=0;ilocal_cm); -#ifdef MPI - t = nonlocal_many(u,n,op,data->nlinfo,data->comm); -#endif - for(i=0;ilocal_cm); - return t; -} - -/*-------------------------------------------------------------------------- - Setup - --------------------------------------------------------------------------*/ - -gs_data *gs_data_setup(uint n, const ulong *label, - uint maxv, crystal_data *crystal) -{ - gs_data *data=tmalloc(gs_data,1); - tuple_list nonzero, primary; - const int nz_index=0, nz_size=1, nz_label=0; - const int pr_nzindex=0, pr_index=1, pr_count=2, pr_size=3, pr_label=0; -#ifdef MPI - tuple_list shared; - const int pr_proc=0; - const int sh_dproc=0, sh_proc2=1, sh_index=2, sh_size=3, sh_label=0; -#else - buffer buf; -#endif -#ifdef MPI - MPI_Comm_dup(crystal->comm,&data->comm); -#else - buffer_init(&buf,1024); -#endif - - /* construct list of nonzeros: (index ^, label) */ - tuple_list_init_max(&nonzero,nz_size,1,0,n); - { - uint i; sint *nzi = nonzero.vi; slong *nzl = nonzero.vl; - for(i=0;iall->buf); -#endif - - /* build list of unique labels w/ lowest associated index: - (index in nonzero ^, primary (lowest) index in label, count, label) */ - tuple_list_init_max(&primary,pr_size,1,0,nonzero.n); - { - uint i; - sint *nzi=nonzero.vi, *pi=primary.vi; - slong *nzl=nonzero.vl, *pl=primary.vl; - sint last=-1; - for(i=0;i1) count+=pi[pr_count]+1; - data->local_cm = tmalloc(sint,count); - } - - /* sort unique labels by primary index: - (nonzero index ^2, primary index ^1, count, label ^2) */ -#ifndef MPI - tuple_list_sort(&primary,pr_index,&buf); - buffer_free(&buf); -#else - tuple_list_sort(&primary,pr_index,&crystal->all->buf); -#endif - - /* construct local condense map */ - { - uint i, n; sint *pi=primary.vi; - sint *cm = data->local_cm; - for(i=primary.n;i;--i,pi+=pr_size) 
if((n=pi[pr_count])>1) { - uint j; sint *nzi=nonzero.vi+nz_size*pi[pr_nzindex]; - for(j=n;j;--j,nzi+=nz_size) *cm++ = nzi[nz_index]; - *cm++ = -1; - } - *cm++ = -1; - } - tuple_list_free(&nonzero); - -#ifndef MPI - tuple_list_free(&primary); -#else - /* assign work proc by label modulo np */ - { - uint i; sint *pi=primary.vi; slong *pl=primary.vl; - for(i=primary.n;i;--i,pi+=pr_size,++pl) - pi[pr_proc]=pl[pr_label]%crystal->num; - } - transfer(1,&primary,pr_proc,crystal); /* transfer to work procs */ - /* primary: (source proc, index on src, useless, label) */ - /* sort by label */ - tuple_list_sort(&primary,pr_size+pr_label,&crystal->all->buf); - /* add sentinel to primary list */ - if(primary.n==primary.max) tuple_list_grow(&primary); - primary.vl[primary.n] = -1; - /* construct shared list: (proc1, proc2, index1, label) */ - tuple_list_init_max(&shared,sh_size,1,0,primary.n); - { - sint *pi1=primary.vi, *si=shared.vi; - slong lbl, *pl1=primary.vl, *sl=shared.vl; - for(;(lbl=pl1[pr_label])!=-1;pi1+=pr_size,++pl1) { - sint *pi2=pi1+pr_size; slong *pl2=pl1+1; - for(;pl2[pr_label]==lbl;pi2+=pr_size,++pl2) { - if(shared.n+2>shared.max) - tuple_list_grow(&shared), - si=shared.vi+shared.n*sh_size, sl=shared.vl+shared.n; - si[sh_dproc] = pi1[pr_proc]; - si[sh_proc2] = pi2[pr_proc]; - si[sh_index] = pi1[pr_index]; - sl[sh_label] = lbl; - si+=sh_size, ++sl, shared.n++; - si[sh_dproc] = pi2[pr_proc]; - si[sh_proc2] = pi1[pr_proc]; - si[sh_index] = pi2[pr_index]; - sl[sh_label] = lbl; - si+=sh_size, ++sl, shared.n++; - } - } - } - tuple_list_free(&primary); - transfer(1,&shared,sh_dproc,crystal); /* transfer to dest procs */ - /* shared list: (useless, proc2, index, label) */ - /* sort by label */ - tuple_list_sort(&shared,sh_size+sh_label,&crystal->all->buf); - /* sort by partner proc */ - tuple_list_sort(&shared,sh_proc2,&crystal->all->buf); - /* count partner procs */ - { - uint i, count=0; sint proc=-1,*si=shared.vi; - for(i=shared.n;i;--i,si+=sh_size) - 
if(si[sh_proc2]!=proc) ++count, proc=si[sh_proc2]; - data->nlinfo = nlinfo_alloc(count,shared.n,maxv); - { int i; MPI_Comm_rank(data->comm,&i); data->nlinfo->id=i; } - } - /* construct non-local info */ - { - uint i; sint proc=-1,*si=shared.vi; - uint *target = data->nlinfo->target; - uint *nshared = data->nlinfo->nshared; - uint *sh_ind = data->nlinfo->sh_ind; - for(i=shared.n;i;--i,si+=sh_size) { - if(si[sh_proc2]!=proc) - proc=si[sh_proc2], *target++ = proc, *nshared++ = 0; - ++nshared[-1], *sh_ind++=si[sh_index]; - } - } - tuple_list_free(&shared); -#endif - return data; -} - -void gs_data_free(gs_data *data) -{ - free(data->local_cm); -#ifdef MPI - nlinfo_free(data->nlinfo); - MPI_Comm_free(&data->comm); -#endif - free(data); -} - -void gs_data_dump(tuple_list *dump, const gs_data *data) -{ -#ifdef MPI - const nonlocal_info *info = data->nlinfo; - uint i,ip,np=info->np,n=0, mi=dump->mi; - const uint *index = info->sh_ind; - sint *out; - for(ip=0;ipnshared[ip]; - tuple_list_init_max(dump,2,0,0,n), dump->n=n; - out = dump->vi; - for(i=0,ip=0;ipnshared[ip],p=info->target[ip]; - for(j=0;jnlinfo; - uint i,np=info->np,max=0; - stats[0] = np, stats[1]=0; - for(i=0;inshared[i]; - stats[1] += n; - if(n > max) max=n; - } - if(np>0) stats[1]/=np; - stats[2]=max; -#else - stats[0]=0, stats[1]=0, stats[2]=0; -#endif -} - -/*-------------------------------------------------------------------------- - FORTRAN Interface - --------------------------------------------------------------------------*/ - -#include "fname.h" - -#define cpgs_setup FORTRAN_NAME(cpgs_setup ,CPGS_SETUP ) -#define cpgs_op FORTRAN_NAME(cpgs_op ,CPGS_OP ) -#define cpgs_op_vec FORTRAN_NAME(cpgs_op_vec ,CPGS_OP_VEC ) -#define cpgs_op_many FORTRAN_NAME(cpgs_op_many,CPGS_OP_MANY) -#define cpgs_free FORTRAN_NAME(cpgs_free ,CPGS_FREE ) - -static gs_data **cpgs_info = 0; -static int cpgs_max = 0; -static int cpgs_n = 0; - -#ifdef MPI -crystal_data *fcrystal_handle(sint h); -#endif - -void cpgs_setup(sint 
*handle, const sint *crystal_handle, - const slong v[], const sint *vn, const sint *maxv) -{ - uint mv = *maxv <= 1 ? 1 : *maxv; -#ifdef MPI - crystal_data *crystal = fcrystal_handle(*crystal_handle); -#else - void *crystal = 0; -#endif - if(cpgs_n==cpgs_max) cpgs_max+=cpgs_max/2+1, - cpgs_info=trealloc(gs_data*,cpgs_info,cpgs_max); - cpgs_info[cpgs_n]=gs_data_setup(*vn,(const ulong*)v,mv,crystal); - *handle = cpgs_n++; -} - -void cpgs_op(const sint *handle, real u[], const sint *op) -{ - if(*op<1 || *op>4) failwith("invalid operation to cgps_op"); - if(*handle<0 || *handle>=cpgs_n || !cpgs_info[*handle]) - failwith("invalid handle to cgps_op"); - ogs_op(u,*op,cpgs_info[*handle]); -} - -void cpgs_op_vec(const sint *handle, real u[], const sint *n, const sint *op) -{ - if(*op<1 || *op>4) failwith("invalid operation to cgps_op_vec"); - if(*handle<0 || *handle>=cpgs_n || !cpgs_info[*handle]) - failwith("invalid handle to cgps_op_vec"); - ogs_op_vec(u,*n,*op,cpgs_info[*handle]); -} - -void cpgs_op_many(const sint *handle, - real u1[], real u2[], real u3[], - real u4[], real u5[], real u6[], - const sint *n, const sint *op) -{ - real *uu[6]={u1,u2,u3,u4,u5,u6}; - if(*op<1 || *op>4) failwith("invalid operation to cgps_op_many"); - if(*handle<0 || *handle>=cpgs_n || !cpgs_info[*handle]) - failwith("invalid handle to cgps_op_many"); - ogs_op_many(uu,*n,*op,cpgs_info[*handle]); -} - -void cpgs_free(sint *handle) -{ - if(*handle<0 || *handle>=cpgs_n || !cpgs_info[*handle]) - failwith("invalid handle to cgps_free"); - gs_data_free(cpgs_info[*handle]); - cpgs_info[*handle] = 0; -} - diff --git a/3rdParty/gslib.github/src/gs_acc.c b/3rdParty/gslib.github/src/gs_acc.c deleted file mode 100755 index 2674ee04f..000000000 --- a/3rdParty/gslib.github/src/gs_acc.c +++ /dev/null @@ -1,582 +0,0 @@ -#ifdef _OPENACC -#include -#include -#include -#include -#include - -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#define gs_op gs_op_t -#include "gs_defs.h" 
-#include "gs_local.h" -#include "comm.h" -#include "mem.h" -#include "sort.h" -#include "crystal.h" -#include "sarray_sort.h" -#include "sarray_transfer.h" -#define GS_ACC_C -#include "gs_acc.h" - -GS_DEFINE_DOM_SIZES() - -typedef enum { mode_plain, mode_vec, mode_many, mode_dry_run } gs_mode; - -static buffer static_buffer = null_buffer; - -struct pw_comm_data { - uint n; /* number of messages */ - uint *p; /* message source/dest proc */ - uint *size; /* size of message */ - uint total; /* sum of message sizes */ -}; - -struct pw_data { - struct pw_comm_data comm[2]; - const uint *map[2]; - comm_req *req; - uint buffer_size; -}; - -typedef void exec_fun(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf); - -typedef void fin_fun(void *data); - -struct gs_remote { - uint buffer_size, mem_size; - void *data; - exec_fun *exec; - fin_fun *fin; -}; - -struct gs_data { - struct comm comm; - const uint *map_local[2]; /* 0=unflagged, 1=all */ - const uint *flagged_primaries; - struct gs_remote r; - int *map_localf[2]; - int *fp_mapf; - int m_size[2]; - int fp_m_size; - int mf_nt[2]; - int fp_m_nt; - int dstride; - int u_size; - uint handle_size; -}; - -static char *pw_exec_recvs(char *buf, const unsigned unit_size, const struct comm *comm, - const struct pw_comm_data *c, comm_req *req) -{ - const uint *p, *pe, *size=c->size; - uint l=0; - l = 0; - for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; - if(len) { - comm_irecv(req++,comm,&(buf[l]),len,*p,*p); - } - l += len; - } - return buf; -} - -static char *pw_exec_sends(char *buf, const unsigned unit_size, const struct comm *comm, - const struct pw_comm_data *c, comm_req *req) -{ - const uint *p, *pe, *size=c->size; - uint l=0; - l = 0; - for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; - if(len) { - comm_isend(req++,comm,&(buf[l]),len,*p,comm->id); - } - l += len; - } - return buf; -} - -// 
-// The above is duplicated from gs.c -// The below is our "new" code -// - -#include -//It doesn't work with MPI_GET in quantum if it is 1 -#ifdef GPUDIRECT -#define USE_GPU_DIRECT 1 -#else -#define USE_GPU_DIRECT 0 -#endif - -static char *pw_exec_recvs_acc(char *buf, const unsigned unit_size, const struct comm *comm, - const struct pw_comm_data *c, comm_req *req, uint *nr) -{ - const uint *p, *pe, *size=c->size; - uint l=0; - - // Find the size used for buf so we can make it present() - for(p=c->p,pe=p+c->n;p!=pe;++p) { - l += *(size++)*unit_size; - } - size=c->size; - - // Now that we have the size, send with GPU-Direct -//#pragma acc data present(buf[0:l]) - { - l = 0; - for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; -//#pragma acc host_data use_device(buf) - { - if(len) { - comm_irecv(req++,comm,&(buf[l]),len,*p,*p); - (*nr)++; - } - } - l += len; - } - } - - return buf; -} - -static char *pw_exec_sends_acc(char *buf, const unsigned unit_size, const struct comm *comm, - const struct pw_comm_data *c, comm_req *req, uint *nr) -{ - const uint *p, *pe, *size=c->size; - uint l=0; - - // Find the size used for buf so we can make it present() - for(p=c->p,pe=p+c->n;p!=pe;++p) { - l += *(size++)*unit_size; - } - size=c->size; - - // Now that we have the size, send with GPU-Direct -//#pragma acc data present(buf[0:l]) - { - l = 0; - for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; -//#pragma acc host_data use_device(buf) - { - if(len) { - comm_isend(req++,comm,&(buf[l]),len,*p,comm->id); - (*nr)++; - } - } - l += len; - } - } - - return buf; -} - -static int map_size(int *map, int *t) -{ - int i,ct=0; - - *t = 0; - - // No map - if(!map) { - return 0; - } - - // "Empty" map (contains only a single -1 terminator) - if(map[0] == -1) { - return 1; - } - - // "Regular" map (contains two -1's as termination) - for(i=ct=0;ct<2;i++){ - if(map[i]==-1){ - ct++; - (*t)++; - } else { - ct=0; - } - } - (*t)--; - // printf(""); - return 
i; -} - -void gs_flatmap_setup_acc(const sint *handle, int n, struct gs_data **fgs_info) -{ - struct pw_data *pwd; - uint m_size,fp_m_size,snd_m_size,rcv_m_size,t_m_size; - uint i,j,k; - int m_nt,fp_m_nt,snd_m_nt,rcv_m_nt,t_m_nt; - int *map,*t_map,*fp_map,*snd_map,*rcv_map; - int *mapf,*t_mapf,*fp_mapf,*snd_mapf,*rcv_mapf; - - pwd = fgs_info[*handle]->r.data; - // Flatten... - map = (int*)(fgs_info[*handle]->map_local[0]); - t_map = (int*)(fgs_info[*handle]->map_local[1]); - fp_map = (int*)(fgs_info[*handle]->flagged_primaries); - // snd_map = (int*)(pwd->map[1]); - //rcv_map = (int*)(pwd->map[0]); - - fp_m_size = map_size(fp_map,&fp_m_nt); - m_size = map_size(map,&m_nt); - //snd_m_size = map_size(snd_map,&snd_m_nt); - //rcv_m_size = map_size(rcv_map,&rcv_m_nt); - t_m_size = map_size(t_map,&t_m_nt); - - fgs_info[*handle]->u_size = n; - fgs_info[*handle]->dstride = n; - fgs_info[*handle]->fp_m_size = fp_m_size; - fgs_info[*handle]->m_size[0] = m_size; - //fgs_info[*handle]->snd_m_size[1] = snd_m_size; - //fgs_info[*handle]->snd_m_size[0] = rcv_m_size; - fgs_info[*handle]->m_size[1] = t_m_size; - fgs_info[*handle]->m_nt[0] = m_nt; - fgs_info[*handle]->fp_m_nt = fp_m_nt; - //fgs_info[*handle]->snd_m_nt[1] = snd_m_nt; - //fgs_info[*handle]->snd_m_nt[0] = rcv_m_nt; - fgs_info[*handle]->m_nt[1] = t_m_nt; - - - mapf = (int*)malloc(m_nt*2*sizeof(int)); - for(i=0,k=0;map[i]!=-1;i=j+1,k++){ - // Record i - mapf[k*2] = i; - for(j=i+1;map[j]!=-1;j++); - // Record j-i - mapf[k*2+1] = j-i-1; - } - - t_mapf = (int*)malloc(t_m_nt*2*sizeof(int)); - for(i=0,k=0;t_map[i]!=-1;i=j+1,k++){ - // Record i - t_mapf[k*2] = i; - for(j=i+1;t_map[j]!=-1;j++); - // Record j-i - t_mapf[k*2+1] = j-i-1; - } - - fp_mapf = (int*)malloc(fp_m_nt*2*sizeof(int)); - for(k=0;kmap_localf[0] = mapf; - fgs_info[*handle]->map_localf[1] = t_mapf; - fgs_info[*handle]->fp_mapf = fp_mapf; - // fgs_info[*handle]->snd_mapf[1] = snd_mapf; - //fgs_info[*handle]->snd_mapf[0] = rcv_mapf; - -#if 0 - fprintf(stderr,"%d 
map[0:%d] -> %lX : %lX\n",m_nt,m_size,map,map+m_size); - fprintf(stderr,"%d t_map[0:%d] -> %lX : %lX\n",t_m_nt,t_m_size,t_map,t_map+t_m_size); - fprintf(stderr,"%d fp_map[0:%d] -> %lX : %lX\n",fp_m_nt,fp_m_size,fp_map,fp_map+fp_m_size); - fprintf(stderr,"%d snd_map[0:%d] -> %lX : %lX\n",snd_m_nt,snd_m_size,snd_map,snd_map+snd_m_size); - fprintf(stderr,"%d rcv_map[0:%d] -> %lX : %lX\n",rcv_m_nt,rcv_m_size,rcv_map,rcv_map+rcv_m_size); - fprintf(stderr,"mapf[0:%d] -> %lX : %lX\n",m_nt,mapf,mapf+2*m_nt); - fprintf(stderr,"t_mapf[0:%d] -> %lX : %lX\n",t_m_nt,t_mapf,t_mapf+2*t_m_nt); - fprintf(stderr,"fp_mapf[0:%d] -> %lX : %lX\n",fp_m_nt,fp_mapf,fp_mapf+2*fp_m_nt); - fprintf(stderr,"snd_mapf[0:%d] -> %lX : %lX\n",snd_m_nt,snd_mapf,snd_mapf+2*snd_m_nt); - fprintf(stderr,"rcv_mapf[0:%d] -> %lX : %lX\n",rcv_m_nt,rcv_mapf,rcv_mapf+2*rcv_m_nt); -#endif - - //#pragma acc enter data copyin(t_mapf[0:t_m_nt*2],mapf[0:m_nt*2],snd_mapf[0:snd_m_nt*2],rcv_mapf[0:rcv_m_nt*2],fp_mapf[0:fp_m_nt*2], t_map[0:t_m_size],map[0:m_size],fp_map[0:fp_m_size],snd_map[0:snd_m_size],rcv_map[0:rcv_m_size]) - - return; -} - -void fgs_fields_acc(const sint *handle, double *u, const sint *stride, const sint *n, - const sint *dom, const sint *op, const sint *transpose, - struct gs_data **fgs_info) -{ - struct pw_data *pwd; - struct comm *comm; - buffer *buf; - const unsigned recv = 0^*transpose, send = 1^*transpose; - uint i,j,k; - uint bs,bl,uds,dstride,dtrans,vn,nr; - uint m_size,fp_m_size,snd_m_size,rcv_m_size,t_m_size; - int m_nt,fp_m_nt,snd_m_nt,rcv_m_nt,t_m_nt; - int *map,*t_map,*fp_map,*snd_map,*rcv_map; - int *mapf,*t_mapf,*fp_mapf,*snd_mapf,*rcv_mapf; - double *dbufp,*sbuf,*rbuf; - double t; - - char hname[1024]; - static int calls=0; - - // Flatten... 
- dstride = *stride; - dtrans = *transpose; - vn = *n; - - // Create temp buffer for gather/scatter and send/recv - buf = &static_buffer; - bs = (*n)*sizeof(double)*(fgs_info[*handle]->r.buffer_size); - bl = (bs/sizeof(double))/2; - //buffer_reserve(buf,bs); - //dbufp = (double*)buf->ptr; - //sbuf = dbufp; - //rbuf = &(dbufp[bl]); - sbuf = malloc(bl*sizeof(double)); - rbuf = malloc(bl*sizeof(double)); - uds = fgs_info[*handle]->u_size * (*n); // Get size of u in number of doubles - pwd = fgs_info[*handle]->r.data; - comm = &fgs_info[*handle]->comm; - /* if(calls==0) { */ - /* gs_flatmap_setup_acc(handle,fgs_info); */ - /* } */ - // Flatten... - map = (int*)(fgs_info[*handle]->map_local[0^*transpose]); - t_map = (int*)(fgs_info[*handle]->map_local[1^*transpose]); - fp_map = (int*)(fgs_info[*handle]->flagged_primaries); - snd_map = (int*)(pwd->map[send]); - rcv_map = (int*)(pwd->map[recv]); - fp_m_size = fgs_info[*handle]->fp_m_size; - m_size = fgs_info[*handle]->m_size[0^*transpose]; - snd_m_size = fgs_info[*handle]->snd_m_size[send]; - rcv_m_size = fgs_info[*handle]->snd_m_size[recv]; - t_m_size = fgs_info[*handle]->m_size[1^*transpose]; - fp_m_nt = fgs_info[*handle]->fp_m_nt; - m_nt = fgs_info[*handle]->m_nt[0^*transpose]; - snd_m_nt = fgs_info[*handle]->snd_m_nt[send]; - rcv_m_nt = fgs_info[*handle]->snd_m_nt[recv]; - t_m_nt = fgs_info[*handle]->m_nt[1^*transpose]; - - //Retrieve flattened maps - mapf = (int*)(fgs_info[*handle]->mapf[0^*transpose]); - t_mapf = (int*)(fgs_info[*handle]->mapf[1^*transpose]); - fp_mapf = (int*)(fgs_info[*handle]->fp_mapf); - snd_mapf = (int*)(fgs_info[*handle]->snd_mapf[send]); - rcv_mapf = (int*)(fgs_info[*handle]->snd_mapf[recv]); - - -#if 0 - calls++; - gethostname(hname, sizeof(hname)); - //fprintf(stderr,"%s: enter %d\n",hname,calls); -#endif -#if 0 -#endif -#if 0 - fprintf(stderr,"%s: map[0:%d] -> %lX : %lX\n",hname,m_size,map,map+m_size); - fprintf(stderr,"%s: t_map[0:%d] -> %lX : %lX\n",hname,t_m_size,t_map,t_map+t_m_size); 
- fprintf(stderr,"%s: fp_map[0:%d] -> %lX : %lX\n",hname,fp_m_size,fp_map,fp_map+fp_m_size); - fprintf(stderr,"%s: snd_map[0:%d] -> %lX : %lX\n",hname,snd_m_size,snd_map,snd_map+snd_m_size); - fprintf(stderr,"%s: rcv_map[0:%d] -> %lX : %lX\n",hname,rcv_m_size,rcv_map,rcv_map+rcv_m_size); -#endif -#if 0 - fprintf(stderr,"%d map[0:%d] -> %lX : %lX\n",m_nt,m_size,map,map+m_size); - fprintf(stderr,"%d t_map[0:%d] -> %lX : %lX\n",t_m_nt,t_m_size,t_map,t_map+t_m_size); - fprintf(stderr,"%d fp_map[0:%d] -> %lX : %lX\n",fp_m_nt,fp_m_size,fp_map,fp_map+fp_m_size); - fprintf(stderr,"%d snd_map[0:%d] -> %lX : %lX\n",snd_m_nt,snd_m_size,snd_map,snd_map+snd_m_size); - fprintf(stderr,"%d rcv_map[0:%d] -> %lX : %lX\n",rcv_m_nt,rcv_m_size,rcv_map,rcv_map+rcv_m_size); - fprintf(stderr,"mapf[0:%d] -> %lX : %lX\n",m_nt,mapf,mapf+2*m_nt); - fprintf(stderr,"t_mapf[0:%d] -> %lX : %lX\n",t_m_nt,t_mapf,t_mapf+2*t_m_nt); - fprintf(stderr,"fp_mapf[0:%d] -> %lX : %lX\n",fp_m_nt,fp_mapf,fp_mapf+2*fp_m_nt); - fprintf(stderr,"snd_mapf[0:%d] -> %lX : %lX\n",snd_m_nt,snd_mapf,snd_mapf+2*snd_m_nt); - fprintf(stderr,"rcv_mapf[0:%d] -> %lX : %lX\n",rcv_m_nt,rcv_mapf,rcv_mapf+2*rcv_m_nt); -#endif - - // if(calls==0) { - //#pragma acc enter data copyin(t_mapf[0:t_m_nt*2],mapf[0:m_nt*2],snd_mapf[0:snd_m_nt*2],rcv_mapf[0:rcv_m_nt*2],fp_mapf[0:fp_m_nt*2], t_map[0:t_m_size],map[0:m_size],fp_map[0:fp_m_size],snd_map[0:snd_m_size],rcv_map[0:rcv_m_size]) -//#pragma acc enter data pcopyin(t_mapf[0:t_m_nt*2],mapf[0:m_nt*2],snd_mapf[0:snd_m_nt*2],rcv_mapf[0:rcv_m_nt*2],fp_mapf[0:fp_m_nt*2], t_map[0:t_m_size],map[0:m_size],fp_map[0:fp_m_size],snd_map[0:snd_m_size],rcv_map[0:rcv_m_size]) - //calls=0; - //} - //calls++; -//#pragma acc data present(u[0:uds],mapf[0:m_nt*2],snd_mapf[0:snd_m_nt*2],rcv_mapf[0:rcv_m_nt*2],fp_mapf[0:fp_m_nt*2],t_map[0:t_m_size],map[0:m_size],fp_map[0:fp_m_size],snd_map[0:snd_m_size],rcv_map[0:rcv_m_size]) - { -//#pragma acc data create(sbuf[0:bl],rbuf[0:bl]) if(bl!=0) - { - // The below 
implementing cgs_many()/gs_aux(): - // - // gs_aux_acc(u,mode_many,dn,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],NULL,us); - // - - // gs_gather_many_acc(u,u,vn,gsh->map_local[0^transpose],dom,op); - for(k=0;kflagged_primaries,dom,op); - for(k=0;kflagged_primaries,dom,op); -//#pragma acc parallel loop gang vector present(u[0:uds],fp_map[0:fp_m_size]) private(i,k) - for(k=0;kcomm[recv],pwd->req,&nr); -#else - pw_exec_recvs((char*)rbuf,vn*sizeof(double),comm,&pwd->comm[recv],pwd->req); -#endif - /* fill send buffer */ - // gs_scatter_many_to_vec_acc(sendbuf,data,vn,pwd->map[send],dom); - for(k=0;kcomm[send],&pwd->req[nr],&nr); - comm_wait(pwd->req,nr); -#else -//#pragma acc update host(sbuf[0:bl]) async(vn+2) -//#pragma acc wait - pw_exec_sends((char*)sbuf,vn*sizeof(double),comm,&pwd->comm[send],&pwd->req[pwd->comm[recv].n]); - comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n); -//#pragma acc update device(rbuf[0:bl]) async(vn+2) -//#pragma acc wait -#endif - /* gather using recv buffer */ - // gs_gather_vec_to_many_acc(data,buf,vn,pwd->map[recv],dom,op); - for(k=0;kmap_local[1^transpose],dom); - for(k=0;k multiple arrays - Scatter from multiple arrays -> strided array, - Scatter from strided array -> multiple arrays, -------------------------------------------------------------------------------*/ -void gs_gather_vec_to_many_acc(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op) -{ - unsigned i; const unsigned unit_size = gs_dom_size[dom]; - typedef void *ptr_to_void; - const ptr_to_void *p = out; const char *q = in; -#define WITH_OP(T,OP) \ - for(i=vn;i;--i) gather_##T##_##OP##_acc(*p++,(const T*)q,vn,map), q+=unit_size -#define WITH_DOMAIN(T) SWITCH_OP(T,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -#undef WITH_OP -} - -void gs_scatter_many_to_vec_acc(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom) -{ - unsigned i; const unsigned unit_size = gs_dom_size[dom]; - typedef const void 
*ptr_to_const_void; - char *p = out; const ptr_to_const_void *q = in; -#define WITH_DOMAIN(T) \ - for(i=vn;i;--i) scatter_##T##_acc((T*)p,vn,*q++,1,map), p+=unit_size - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - diff --git a/3rdParty/gslib.github/src/gs_test.c b/3rdParty/gslib.github/src/gs_test.c deleted file mode 100644 index 588a52b17..000000000 --- a/3rdParty/gslib.github/src/gs_test.c +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "gs_defs.h" -#include "gs.h" - -typedef double T; -const gs_dom dom = gs_double; - -static void test(const struct comm *comm) -{ - struct gs_data *gsh; - const uint np = comm->np; - slong *id = tmalloc(slong,np+4); - T *v = tmalloc(T,np+4); - uint i; - id[0] = -(slong)(np+10+3*comm->id); - for(i=0;iid+1; - id[np+2] = comm->id+1; - id[np+3] = np-comm->id; - gsh = gs_setup(id,np+4,comm,0,gs_auto,1); - free(id); - - for(i=0;iid==0) for(i=0;iid==0) printf("\n"); - for(i=0;iid==0) for(i=0;i and "types.h" */ - -#ifndef TYPES_H -#warning "minmax.h" depends on "types.h" -#endif - -/*-------------------------------------------------------------------------- - Min, Max, Norm - --------------------------------------------------------------------------*/ - -#define DECLMINMAX(type, prefix) \ -static type prefix##min_2(type a, type b) { return bb?a:b; } \ -static void prefix##minmax_2(type *min, type *max, type a, type b) \ -{ if(bb?(a>c?a:c):(b>c?b:c); } \ -static void prefix##minmax_3(type *min, type *max, type a, type b, type c) \ -{ if(b -#include -#include -#include -#include "errmem.h" -#include "types.h" -#include "sort.h" -#include "tuple_list.h" - -const unsigned mi=2, ml=2, mr=2; - -void test1() -{ - buffer buf; - uint i,j; - tuple_list tl; - buffer_init(&buf,1024); - tuple_list_init_max(&tl,mi,ml,mr,500); - tl.n=tl.max; - for(i=0;i -#include -#include -#include -#ifdef MPI -# include -# 
include -#endif - -#include "errmem.h" -#include "types.h" -#include "minmax.h" -#include "poly.h" -#include "tensor.h" -#include "findpt.h" -#include "tuple_list.h" - -#ifdef MPI -# include "crystal.h" -# include "transfer.h" -#else -typedef void crystal_data; -#endif - - -#ifdef MPI - -static int iceil(real x) { return ceilr(x); } -static int ifloor(real x) { return floorr(x); } - -typedef struct { - real bnd[6], fac[3]; - unsigned n, ncell; - uint *cell_offset, *cell; -} hash_data; - -static void hash_free(hash_data *h) -{ - free(h->cell_offset), free(h->cell); -} - -static void hash_range(const hash_data *h, const real *bnd, unsigned d, - unsigned *ia, unsigned *ib) -{ - const real a = bnd[d*2 ], b = bnd[d*2+1]; - const int i0 = ifloor( (a - h->bnd[d*2]) * h->fac[d] ); - const unsigned i1 = iceil( (b - h->bnd[d*2]) * h->fac[d] ); - *ia = imax_2(0,i0); - *ib = umin_2(i1,h->n); - if(*ib == *ia) ++(*ib); -} - -static void hash_size_calc(hash_data *h, unsigned ndim, const real *bnd, - const crystal_data *crystal) -{ - unsigned d; uint nn; double out[3], in[3]; - - for(d=0;dcomm); - for(d=0;dbnd[d*2]=in[d]; - - for(d=0;dcomm); - for(d=0;dbnd[d*2+1]=in[d]; - - h->n = iceil(pow(crystal->num,1./ndim)); - for(d=0;dfac[d] = h->n/(h->bnd[d*2+1] - h->bnd[d*2]); -/* - h->fac[1] = h->n/(h->bnd[3] - h->bnd[2]); - h->fac[2] = h->n/(h->bnd[5] - h->bnd[4]); -*/ - nn=(uint)h->n*h->n; if(ndim==3) nn*=h->n; - h->ncell = (nn-1)/crystal->num+1; - if(crystal->id+(h->ncell-1)*crystal->num>=nn) --h->ncell; - h->cell_offset = tmalloc(uint,h->ncell+1); -} - -static void hash_build(hash_data *h, unsigned ndim, const real *bnd, - crystal_data *crystal) -{ - tuple_list tl; sint *work; - unsigned range[6]={0,1,0,1,0,1}; - uint i, j, k; uint count=1; - - hash_size_calc(h,ndim,bnd,crystal); - - for(i=0;in+j)*h->n+i; - *work++ = index; - *work++ = index%crystal->num; - } - } - } - - transfer(1,&tl,1,crystal); - - /* sort on hash index (equiv. 
to cell) */ - tuple_list_sort(&tl,0,&crystal->all->buf); - - h->cell=tmalloc(uint,tl.n); - j=0; h->cell_offset[0]=0; h->cell_offset[j+1]=0; - work = tl.vi; - for(i=tl.n;i;--i,work+=2) { - const uint cell = work[0]/crystal->num, - src = work[1]; - for(k=j+1;k<=cell;++k) h->cell_offset[k+1]=h->cell_offset[k]; - j=cell; - h->cell[h->cell_offset[j+1]++] = src; - } - for(k=j+1;kncell;++k) h->cell_offset[k+1]=h->cell_offset[k]; - tuple_list_free(&tl); -} - -#endif - -typedef struct { - unsigned ndim; - void *fd; - findpt_func findpt; -#ifdef MPI - hash_data hash; - crystal_data *crystal; - tuple_list tl_hash, /* vi:{id,p,cell} vd:{x,y,z} */ - tl_in, /* vi:{id,p,sp} vd:{x,y,z} */ - tl_out; /* vi:{id,p,el,code} vd:{r,s,t,dist} */ -#endif -} pfindpt_data; - -#define IW_ID 0 -#define IW_PROC 1 -#define IW_CELL 2 -#define IW_SRCP 2 -#define IW_EL 2 -#define IW_CODE 3 - -#define DW_X 0 -#define DW_R 0 -#define DW_DIST ndim - -pfindpt_data *pfindpt_setup(unsigned ndim, const real *const*xw, - const unsigned *n, uint nel, - uint max_hash_size, real bbox_tol, - crystal_data *crystal) -{ - pfindpt_data *p = tmalloc(pfindpt_data,1); - const real *bnd; - p->ndim = ndim; -#ifdef MPI - p->crystal = crystal; -#endif - if(ndim==2) { - p->fd = findpt_setup_2(xw,n,nel,max_hash_size,bbox_tol); - p->findpt = (findpt_func)&findpt_2; - bnd = findpt_allbnd_2((findpt_data_2*)p->fd); - } else if(ndim==3) { - p->fd = findpt_setup_3(xw,n,nel,max_hash_size,bbox_tol); - p->findpt = (findpt_func)&findpt_3; - bnd = findpt_allbnd_3((findpt_data_3*)p->fd); - } else - bnd=0,fail("%s: pfindpt_setup: parameter ndim=%u invalid",__FILE__,ndim); -#ifdef MPI - hash_build(&p->hash,ndim,bnd,p->crystal); - tuple_list_init(&p->tl_hash,3,0,ndim); - tuple_list_init(&p->tl_in,3,0,ndim); - tuple_list_init(&p->tl_out,4,0,ndim+1); -#endif - return p; -} - -void pfindpt_free(pfindpt_data *p) -{ - if(p->ndim==2) findpt_free_2(p->fd); else findpt_free_3(p->fd); -#ifdef MPI - hash_free(&p->hash); - 
tuple_list_free(&p->tl_hash); - tuple_list_free(&p->tl_in); - tuple_list_free(&p->tl_out); -#endif - free(p); -} - -#define I_PROC 0 -#define I_EL 1 -#define I_CODE 2 - -#define D_DIST 0 -#define D_X 1 -#define D_R (D_X+ndim) - -#define GROW(a,b) (a+a/2+1>b?a+a/2+1:b) - -void pfindpt_transfer(pfindpt_data *p, tuple_list *list, int dynamic) -{ -#ifdef MPI - transfer(dynamic,list,I_PROC,p->crystal); -#endif -} - -void pfindpt(pfindpt_data *p, tuple_list *list, int guess) -{ - const int lmi=list->mi, lmr=list->mr; - const int ndim=p->ndim; int d; - uint i, id; - sint *ri; real *rr; -#ifdef MPI - sint *oi; real *or; - const uint np=p->crystal->num; - p->tl_hash.n = p->tl_in.n = p->tl_out.n = 0; - if(p->tl_hash.maxn) - tuple_list_resize(&p->tl_hash,GROW(p->tl_hash.max,list->n)); - oi = p->tl_hash.vi, or = p->tl_hash.vr; - id = p->crystal->id; -#else - id = 0; -#endif - ri=list->vi,rr=list->vr; - for(i=0;in;++i,ri+=lmi,rr+=lmr) { - ri[I_PROC]=id; - ri[I_CODE]=p->findpt(p->fd,&rr[D_X],guess, - (uint*)&ri[I_EL],&rr[D_R],&rr[D_DIST]); -#ifdef MPI - if(ri[I_CODE]!=0) { /* point not found, or point on boundary */ - int hi; uint hash=0; - for(d=ndim-1;d>=0;--d) { - hi = ifloor((rr[D_X+d]-p->hash.bnd[d*2])*p->hash.fac[d]); - if(hi<0 || (unsigned)hi>=p->hash.n) { ri[I_CODE]=-1; break; } - hash = hash*p->hash.n+hi; - } - if(d>=0) continue; - oi[IW_ID]=i; - oi[IW_PROC]=hash%np; - oi[IW_CELL]=hash/np; - memcpy(or,&rr[D_X],ndim*sizeof(real)); - oi+=3, or+=ndim, ++p->tl_hash.n; - } -#endif - } - -#ifdef MPI - - transfer(1,&p->tl_hash,IW_PROC,p->crystal); - - ri=p->tl_hash.vi,rr=p->tl_hash.vr; - oi=p->tl_in.vi,or=p->tl_in.vr; - for(i=0;itl_hash.n;++i,ri+=3,rr+=ndim) { - uint *proc = p->hash.cell + p->hash.cell_offset[ri[IW_CELL]], - *pend = p->hash.cell + p->hash.cell_offset[ri[IW_CELL]+1]; - for(;proc!=pend;++proc) { - if((sint)*proc==ri[IW_PROC]) continue; - if(p->tl_in.n==p->tl_in.max) { - tuple_list_grow(&p->tl_in); - oi=p->tl_in.vi+3*p->tl_in.n; - or=p->tl_in.vr+ndim*p->tl_in.n; 
- } - oi[IW_ID]=ri[IW_ID]; - oi[IW_SRCP]=ri[IW_PROC]; - oi[IW_PROC]=*proc; - memcpy(or,rr,ndim*sizeof(real)); - oi+=3, or+=ndim, ++p->tl_in.n; - } - } - - transfer(1,&p->tl_in,IW_PROC,p->crystal); - - if(p->tl_out.maxtl_in.n) - tuple_list_resize(&p->tl_out,GROW(p->tl_out.max,p->tl_in.n)); - ri=p->tl_in.vi,rr=p->tl_in.vr; - oi=p->tl_out.vi,or=p->tl_out.vr; - for(i=0;itl_in.n;++i,ri+=3,rr+=ndim) { - oi[IW_CODE]=p->findpt(p->fd,&rr[DW_X],0, - (uint*)&oi[IW_EL],&or[DW_R],&or[DW_DIST]); - if(oi[IW_CODE]==-1) continue; - oi[IW_ID]=ri[IW_ID]; - oi[IW_PROC]=ri[IW_SRCP]; - oi+=4, or+=ndim+1, ++p->tl_out.n; - } - - transfer(1,&p->tl_out,IW_PROC,p->crystal); - - ri=p->tl_out.vi,rr=p->tl_out.vr; - for(i=0;itl_out.n;++i,ri+=4,rr+=ndim+1) { - oi = list->vi+lmi*ri[IW_ID]; - or = list->vr+lmr*ri[IW_ID]; - if(oi[I_CODE]!=0) { - if(ri[IW_CODE]==1 && oi[I_CODE]==1 && rr[DW_DIST]>=or[D_DIST]) continue; - oi[I_PROC] = ri[IW_PROC]; - oi[I_CODE] = ri[IW_CODE]; - or[D_DIST] = rr[DW_DIST]; - oi[I_EL] = ri[IW_EL]; - memcpy(&or[D_R],&rr[DW_R],ndim*sizeof(real)); - } - } - -#endif -} - -void pfindpt_weights(pfindpt_data *p, const real *r) -{ - if(p->ndim==2) findpt_weights_2(p->fd,r); - else findpt_weights_3(p->fd,r); -} - -real pfindpt_eval(pfindpt_data *p, const real *u) -{ - if(p->ndim==2) return findpt_eval_2(p->fd,u); - else return findpt_eval_3(p->fd,u); -} - diff --git a/3rdParty/gslib.github/src/pfindpt.h b/3rdParty/gslib.github/src/pfindpt.h deleted file mode 100644 index d43b52679..000000000 --- a/3rdParty/gslib.github/src/pfindpt.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef PFINDPT_H -#define PFINDPT_H - -/* requires "types.h", "tuple_list.h", and, - when MPI is defined, "crystal.h" */ -#if !defined(TYPES_H) || !defined(TUPLE_LIST_H) || \ - ( defined(MPI) && !defined(CRYSTAL_H) ) -#warning "pfindpt.h" requires "types.h", "tuple_list.h", and "crystal.h" -#endif - -typedef struct pfindpt_data_ pfindpt_data; - -#ifndef MPI -# define crystal_data void -#endif - -pfindpt_data 
*pfindpt_setup(unsigned ndim, const real *const*xw, - const unsigned *n, uint nel, - uint max_hash_size, real bbox_tol, - crystal_data *crystal); - -#ifndef MPI -# undef crystal_data -#endif - -void pfindpt_free(pfindpt_data *p); -void pfindpt_transfer(pfindpt_data *p, tuple_list *list, int dynamic); -void pfindpt(pfindpt_data *p, tuple_list *list, int guess); -void pfindpt_weights(pfindpt_data *p, const real *r); -real pfindpt_eval(pfindpt_data *p, const real *u); - -#endif - diff --git a/3rdParty/gslib.github/src/pfindpt_test.c b/3rdParty/gslib.github/src/pfindpt_test.c deleted file mode 100644 index 5ab25da6a..000000000 --- a/3rdParty/gslib.github/src/pfindpt_test.c +++ /dev/null @@ -1,88 +0,0 @@ -#include -#include -#include -#include -#ifdef MPI -#include -#endif - -#include "errmem.h" -#include "types.h" -#include "poly.h" -#include "tuple_list.h" -#include "crystal.h" -#include "pfindpt.h" - -#define N 16 -#define NNN (N*N*N) - -real lobz[N]; -unsigned ord[3]={N,N,N}; -real elx[NNN], ely[NNN], elz[NNN]; -real *const xw[3] = {elx,ely,elz}; - -pfindpt_data *p; - -int main(int narg, char* arg[]) -{ - int id=0,np=1,i,j,k; -#ifndef MPI - void *crystal=0; -#else - crystal_data cd, *crystal=&cd; - MPI_Comm comm; - MPI_Init(&narg,&arg); - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - MPI_Comm_rank(comm,&id); - MPI_Comm_size(comm,&np); - crystal_init(crystal,comm); -#endif - - lobatto_nodes(lobz,N); - for(k=0;k -#include -#include -#include /* for cos, fabs */ -#include /* for memcpy */ -#include - -#include "errmem.h" -#include "types.h" - -/* - For brevity's sake, some names have been shortened - Quadrature rules - Gauss -> Gauss-Legendre quadrature (open) - Lobatto -> Gauss-Lobatto-Legendre quadrature (closed at both ends) - Polynomial bases - Legendre -> Legendre basis - Gauss -> Lagrangian basis using Gauss quadrature nodes - Lobatto -> Lagrangian basis using Lobatto quadrature nodes -*/ - -/*-------------------------------------------------------------------------- - 
Legendre Polynomial Matrix Computation - (compute P_i(x_j) for i = 0, ..., n and a given set of x) - --------------------------------------------------------------------------*/ - -/* precondition: n >= 1 - inner index is x index (0 ... m-1); - outer index is Legendre polynomial number (0 ... n) - */ -void legendre_matrix(const real *x, int m, real *P, int n) -{ - int i,j; - real *Pjm1 = P, *Pj = Pjm1+m, *Pjp1 = Pj+m; - for(i=0; i= 1, n even */ -static void legendre_row_even(real x, real *P, int n) -{ - real p[2] = {1, x}; - int i; - P[0] = 1, P[1] = x; - for(i=1; i<=n-2; i+=2) { - p[0] = ((2*i+1)*x*p[1]- i *p[0])/(i+1); - p[1] = ((2*i+3)*x*p[0]-(i+1)*p[1])/(i+2); - P[i+1] = p[0]; - P[i+2] = p[1]; - } - P[n] = ((2*n-1)*x*p[1]-(n-1)*p[0])/n; -} - -/* precondition: n >= 1, n odd */ -static void legendre_row_odd(real x, real *P, int n) -{ - real p[2] = {1, x}; - int i; - P[0] = 1, P[1] = x; - for(i=1; i<=n-2; i+=2) { - p[0] = ((2*i+1)*x*p[1]- i *p[0])/(i+1); - p[1] = ((2*i+3)*x*p[0]-(i+1)*p[1])/(i+2); - P[i+1] = p[0]; - P[i+2] = p[1]; - } -} - -/* precondition: n >= 1 - compute P_i(x) with i = 0 ... n - */ -void legendre_row(real x, real *P, int n) -{ - if(n&1) legendre_row_odd(x,P,n); else legendre_row_even(x,P,n); -} - -/* precondition: n >= 1 - inner index is Legendre polynomial number (0 ... n) - outer index is x index (0 ... 
m-1); - */ -void legendre_matrix_t(const real *x, int m, real *P, int n) -{ - int i; - if(n&1) for(i=0;i= 0 */ -static real legendre(int n, real x) -{ - real p[2] = {1, x}; - int i; - for(i=1; i 0 */ -static real legendre_d1(int n, real x) -{ - real p[2] = {3*x, 1}; - int i; - for(i=2; i 1 */ -static real legendre_d2(int n, real x) -{ - real p[2] = {3, 15*x}; - int i; - for(i=3; i-x*EPS); - z[i] = x - legendre(n,x)/legendre_d1(n,x); - } - if(n&1) z[n/2]=0; - for(j=(n+1)/2,i=n/2-1; j-x*EPS); - z[i] = x - legendre_d1(np,x)/legendre_d2(np,x); - } - if(n&1) z[n/2]=0; - for(j=(n+1)/2,i=n/2-1; j= 2 - given the Gauss quadrature rule (z,w,n), compute the square matrix J - for transforming from the Gauss basis to the Legendre basis: - - u_legendre(i) = sum_j J(i,j) u_gauss(j) - - computes J = .5 (2i+1) w P (z ) - ij j i j - - in column major format (inner index is i, the Legendre index) - */ -void gauss_to_legendre(const real *z, const real *w, int n, real *J) -{ - int i,j; - legendre_matrix_t(z,n,J,n-1); - for(j=0;j= 2 - same as above, but - in row major format (inner index is j, the Gauss index) - */ -void gauss_to_legendre_t(const real *z, const real *w, int n, real *J) -{ - int i,j; - legendre_matrix(z,n,J,n-1); - for(i=0;i= 3 - given the Lobatto quadrature rule (z,w,n), compute the square matrix J - for transforming from the Gauss basis to the Legendre basis: - - u_legendre(i) = sum_j J(i,j) u_lobatto(j) - - in column major format (inner index is i, the Legendre index) - */ -void lobatto_to_legendre(const real *z, const real *w, int n, real *J) -{ - int i,j,m=(n+1)/2; - real *p = J, *q; - real ww, sum; - if(n&1) - for(j=0;jJ [0 ... n-1] interpolation weights - p->D [0 ... n-1] 1st derivative weights - p->D2[0 ... 
n-1] 2nd derivative weights - are computed for a given x with: - lagrange_0(p,x); // compute p->J - lagrange_1(p,x); // compute p->J, p->D - lagrange_2(p,x); // compute p->J, p->D, p->D2 - lagrange_2u(p); // compute p->D2 after call of lagrange_1(p,x); - These functions use the z array supplied to setup - (that pointer should not be freed between calls) - Weights for x=z[0] and x=z[n-1] are computed during setup; access as: - p->J_z0, etc. and p->J_zn, etc. - - lagrange_free(&p); // deallocate memory allocated by setup - --------------------------------------------------------------------------*/ - -typedef struct { - unsigned n; /* number of Lagrange nodes */ - const real *z; /* Lagrange nodes (user-supplied) */ - real *J, *D, *D2; /* weights for 0th,1st,2nd derivatives */ - real *J_z0, *D_z0, *D2_z0; /* ditto at z[0] (computed at setup) */ - real *J_zn, *D_zn, *D2_zn; /* ditto at z[n-1] (computed at setup) */ - real *w, *d, *u0, *v0, *u1, *v1, *u2, *v2; /* work data */ -} lagrange_data; - -static void lagrange_0(lagrange_data *p, real x) -{ - unsigned i, n=p->n; - for(i=0 ; id[i] = x-p->z[i]; - for(i=0 ; iu0[i+1] = p->d[i]*p->u0[i]; - for(i=n-1; i ; --i) p->v0[i-1] = p->d[i]*p->v0[i]; - for(i=0 ; iJ[i] = p->w[i]*p->u0[i]*p->v0[i]; -} - -static void lagrange_1(lagrange_data *p, real x) -{ - unsigned i, n=p->n; - for(i=0 ; id[i] = x-p->z[i]; - for(i=0 ; iu0[i+1] = p->d[i]*p->u0[i], - p->u1[i+1] = p->d[i]*p->u1[i] + p->u0[i]; - for(i=n-1; i ; --i) - p->v0[i-1] = p->d[i]*p->v0[i], - p->v1[i-1] = p->d[i]*p->v1[i] + p->v0[i]; - for(i=0 ; iJ[i] = p->w[i]*p->u0[i]*p->v0[i], - p->D[i] = p->w[i]*(p->u1[i]*p->v0[i]+p->u0[i]*p->v1[i]); -} - -static void lagrange_2(lagrange_data *p, real x) -{ - unsigned i,n=p->n; - for(i=0 ; id[i] = x-p->z[i]; - for(i=0 ; iu0[i+1]=p->d[i]*p->u0[i], - p->u1[i+1]=p->d[i]*p->u1[i]+p->u0[i], - p->u2[i+1]=p->d[i]*p->u2[i]+2*p->u1[i]; - for(i=n-1; i ; --i) - p->v0[i-1]=p->d[i]*p->v0[i], - p->v1[i-1]=p->d[i]*p->v1[i]+p->v0[i], - 
p->v2[i-1]=p->d[i]*p->v2[i]+2*p->v1[i]; - for(i=0 ; iJ [i]=p->w[i]*p->u0[i]*p->v0[i], - p->D [i]=p->w[i]*(p->u1[i]*p->v0[i]+p->u0[i]*p->v1[i]), - p->D2[i]=p->w[i]*(p->u2[i]*p->v0[i]+2*p->u1[i]*p->v1[i]+p->u0[i]*p->v2[i]); -} - -static void lagrange_2u(lagrange_data *p) -{ - unsigned i,n=p->n; - for(i=0 ; iu2[i+1]=p->d[i]*p->u2[i]+2*p->u1[i]; - for(i=n-1; i ; --i) - p->v2[i-1]=p->d[i]*p->v2[i]+2*p->v1[i]; - for(i=0 ; iD2[i]=p->w[i]*(p->u2[i]*p->v0[i]+2*p->u1[i]*p->v1[i]+p->u0[i]*p->v2[i]); -} - -void lagrange_setup(lagrange_data *p, const real *z, unsigned n) -{ - unsigned i,j; - p->n=n, p->z=z; - p->w = tmalloc(real, 17*n); - p->d = p->w+n; - p->J = p->d+n, p->D = p->J+n, p->D2 = p->D+n; - p->u0=p->D2+n, p->v0=p->u0+n; - p->u1=p->v0+n, p->v1=p->u1+n; - p->u2=p->v1+n, p->v2=p->u2+n; - p->J_z0=p->v2+n, p->D_z0=p->J_z0+n, p->D2_z0=p->D_z0+n; - p->J_zn=p->D2_z0+n, p->D_zn=p->J_zn+n, p->D2_zn=p->D_zn+n; - for(i=0; iw[i] = 1/ww; - } - p->u0[0] = p->v0[n-1] = 1; - p->u1[0] = p->v1[n-1] = 0; - p->u2[0] = p->v2[n-1] = 0; - lagrange_2(p,z[0 ]); memcpy(p->J_z0,p->J,3*n*sizeof(real)); - lagrange_2(p,z[n-1]); memcpy(p->J_zn,p->J,3*n*sizeof(real)); -} - -void lagrange_free(lagrange_data *p) -{ - free(p->w); -} - diff --git a/3rdParty/gslib.github/src/poly_test2.c b/3rdParty/gslib.github/src/poly_test2.c deleted file mode 100644 index a72be7102..000000000 --- a/3rdParty/gslib.github/src/poly_test2.c +++ /dev/null @@ -1,374 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "poly.h" -#include "rdtsc.h" - -#define N 32 -#define REPEAT 1000000 - -#define USE_HW_COUNTER 1 - -#if USE_HW_COUNTER -DEFINE_HW_COUNTER() -#endif - - -#define EPS (128*DBL_EPSILON) - -static int not_same(double a, double b) { - return fabs(b-a)/(fabs(b)+fabs(a)) > 16*EPS; -} - -/* OLD CODE (reference implemenatoin ) =======================================*/ - -typedef double real; -#define cosr cos 
-#define fabsr fabs -#define PI 3.1415926535897932384626433832795028841971693993751058209749445923 - -/* precondition: n >= 0 */ -static real legendre(int n, real x) -{ - real p[2]; - int i; - p[0]=1, p[1]=x; - for(i=1; i 0 */ -static real legendre_d1(int n, real x) -{ - real p[2]; - int i; - p[0]=3*x, p[1]=1; - for(i=2; i 1 */ -static real legendre_d2(int n, real x) -{ - real p[2]; - int i; - p[0]=3, p[1]=15*x; - for(i=3; i-x*EPS); - z[i] = x - legendre(n,x)/legendre_d1(n,x); - } - if(n&1) z[n/2]=0; - for(j=(n+1)/2,i=n/2-1; j-x*EPS); - z[i] = x - legendre_d1(np,x)/legendre_d2(np,x); - } - if(n&1) z[n/2]=0; - for(j=(n+1)/2,i=n/2-1; jn; - for(i=0 ; id[i] = x-p->z[i]; - for(i=0 ; iu0[i+1] = p->d[i]*p->u0[i]; - for(i=n-1; i ; --i) p->v0[i-1] = p->d[i]*p->v0[i]; - for(i=0 ; iJ[i] = p->w[i]*p->u0[i]*p->v0[i]; -} - -static void ref_lagrange_1(ref_lagrange_data *p, real x) -{ - unsigned i, n=p->n; - for(i=0 ; id[i] = x-p->z[i]; - for(i=0 ; iu0[i+1] = p->d[i]*p->u0[i], - p->u1[i+1] = p->d[i]*p->u1[i] + p->u0[i]; - for(i=n-1; i ; --i) - p->v0[i-1] = p->d[i]*p->v0[i], - p->v1[i-1] = p->d[i]*p->v1[i] + p->v0[i]; - for(i=0 ; iJ[i] = p->w[i]*p->u0[i]*p->v0[i], - p->D[i] = p->w[i]*(p->u1[i]*p->v0[i]+p->u0[i]*p->v1[i]); -} - -static void ref_lagrange_2(ref_lagrange_data *p, real x) -{ - unsigned i,n=p->n; - for(i=0 ; id[i] = x-p->z[i]; - for(i=0 ; iu0[i+1]=p->d[i]*p->u0[i], - p->u1[i+1]=p->d[i]*p->u1[i]+p->u0[i], - p->u2[i+1]=p->d[i]*p->u2[i]+2*p->u1[i]; - for(i=n-1; i ; --i) - p->v0[i-1]=p->d[i]*p->v0[i], - p->v1[i-1]=p->d[i]*p->v1[i]+p->v0[i], - p->v2[i-1]=p->d[i]*p->v2[i]+2*p->v1[i]; - for(i=0 ; iJ [i]=p->w[i]*p->u0[i]*p->v0[i], - p->D [i]=p->w[i]*(p->u1[i]*p->v0[i]+p->u0[i]*p->v1[i]), - p->D2[i]=p->w[i]*(p->u2[i]*p->v0[i]+2*p->u1[i]*p->v1[i]+p->u0[i]*p->v2[i]); -} - -static void ref_lagrange_2u(ref_lagrange_data *p) -{ - unsigned i,n=p->n; - for(i=0 ; iu2[i+1]=p->d[i]*p->u2[i]+2*p->u1[i]; - for(i=n-1; i ; --i) - p->v2[i-1]=p->d[i]*p->v2[i]+2*p->v1[i]; - for(i=0 ; 
iD2[i]=p->w[i]*(p->u2[i]*p->v0[i]+2*p->u1[i]*p->v1[i]+p->u0[i]*p->v2[i]); -} - -static void ref_lagrange_setup(ref_lagrange_data *p, const real *z, unsigned n) -{ - unsigned i,j; - p->n=n, p->z=z; - p->w = tmalloc(real, 17*n); - p->d = p->w+n; - p->J = p->d+n, p->D = p->J+n, p->D2 = p->D+n; - p->u0=p->D2+n, p->v0=p->u0+n; - p->u1=p->v0+n, p->v1=p->u1+n; - p->u2=p->v1+n, p->v2=p->u2+n; - p->J_z0=p->v2+n, p->D_z0=p->J_z0+n, p->D2_z0=p->D_z0+n; - p->J_zn=p->D2_z0+n, p->D_zn=p->J_zn+n, p->D2_zn=p->D_zn+n; - for(i=0; iw[i] = 1/ww; - } - p->u0[0] = p->v0[n-1] = 1; - p->u1[0] = p->v1[n-1] = 0; - p->u2[0] = p->v2[n-1] = 0; - ref_lagrange_2(p,z[0 ]); memcpy(p->J_z0,p->J,3*n*sizeof(real)); - ref_lagrange_2(p,z[n-1]); memcpy(p->J_zn,p->J,3*n*sizeof(real)); -} - -void ref_lagrange_free(ref_lagrange_data *p) -{ - free(p->w); -} - -/* TEST CODE (compare new against reference) =================================*/ - -int main() -{ - uint i,n; - -#if USE_HW_COUNTER - { - int d; - unsigned long long tic, toc; - unsigned r; - #define TIME(t, repeat, what) do { \ - for(r=repeat;r;--r) { what; } \ - tic = getticks(); \ - for(r=repeat;r;--r) { what; } \ - toc = getticks(); \ - t = toc-tic; \ - } while(0) - - for(d=0;d<3;++d) for(n=1;n -#include - -#include "types.h" - -typedef uint Index; - -#define sort jl_sort -#define Value uint -#define Data sort_data -typedef struct { Value v; Index i; } Data; -#include "sort_imp.c" - -#undef sort -#undef Value -#undef Data - -#ifdef GLOBAL_INT -# define Value ulong -# define Data sort_data_long - typedef struct { Value v; Index i; } Data; -# define radix_count radix_count_long -# define radix_offsets radix_offsets_long -# define radix_zeros radix_zeros_long -# define radix_pass radix_pass_long -# define radix_sort radix_sort_long -# define radix_index_pass_b radix_index_pass_b_long -# define radix_index_pass_m radix_index_pass_m_long -# define radix_index_pass_e radix_index_pass_e_long -# define radix_index_pass_be radix_index_pass_be_long -# 
define radix_index_sort radix_index_sort_long -# define merge_sort merge_sort_long -# define merge_index_sort merge_index_sort_long -# define sort sort_long -# define index_sort index_sort_long -# include "sort_imp.c" -#endif - diff --git a/3rdParty/gslib.github/src/sort_imp.c b/3rdParty/gslib.github/src/sort_imp.c deleted file mode 100644 index 307d9c4da..000000000 --- a/3rdParty/gslib.github/src/sort_imp.c +++ /dev/null @@ -1,346 +0,0 @@ - -/* this file possibly included multiple times by sort.c - for sorting different integer sizes; - - look in sort.c for some controlling macro definitions, - like Value, Index, Data, and function names */ - -#ifdef Value - -/*------------------------------------------------------------------------------ - - Radix Sort - - stable; O(n+k) time and extra storage - where k = (digits in an int) * 2^(bits per digit) - (e.g. k = 4 * 256 = 1024 for 32-bit ints with 8-bit digits) - - brief description: - input sorted stably on each digit, starting with the least significant - counting sort is used for each digit: - a pass through the input counts the occurences of each digit value - on a second pass, each input has a known destination - - tricks: - all counting passes are combined into one - the counting pass also computes the inclusive bit-wise or of all inputs, - which is used to skip digit positions for which all inputs have zeros - - ----------------------------------------------------------------------------*/ - -#define DIGIT_BITS 8 -#define DIGIT_VALUES (1<i) count[i][val&DIGIT_MASK]++, val>>=DIGIT_BITS -#define COUNT_DIGIT_02(n,i) COUNT_DIGIT_01(n,i); COUNT_DIGIT_01(n,i+ 1) -#define COUNT_DIGIT_04(n,i) COUNT_DIGIT_02(n,i); COUNT_DIGIT_02(n,i+ 2) -#define COUNT_DIGIT_08(n,i) COUNT_DIGIT_04(n,i); COUNT_DIGIT_04(n,i+ 4) -#define COUNT_DIGIT_16(n,i) COUNT_DIGIT_08(n,i); COUNT_DIGIT_08(n,i+ 8) -#define COUNT_DIGIT_32(n,i) COUNT_DIGIT_16(n,i); COUNT_DIGIT_16(n,i+16) -#define COUNT_DIGIT_64(n,i) COUNT_DIGIT_32(n,i); 
COUNT_DIGIT_32(n,i+32) - -static Value radix_count(const Value *A, const Value *end, Index stride, - Index count[DIGITS][DIGIT_VALUES]) -{ - Value bitorkey = 0; - memset(count,0,COUNT_SIZE*sizeof(Index)); - do { - Value val=*A; - bitorkey|=val; - COUNT_DIGIT_64(DIGITS,0); - /* above macro expands to: - if(DIGITS> 0) count[ 0][val&DIGIT_MASK]++, val>>=DIGIT_BITS; - if(DIGITS> 1) count[ 1][val&DIGIT_MASK]++, val>>=DIGIT_BITS; - ... - if(DIGITS>63) count[63][val&DIGIT_MASK]++, val>>=DIGIT_BITS; - */ - } while(A+=stride,A!=end); - return bitorkey; -} - -#undef COUNT_DIGIT_01 -#undef COUNT_DIGIT_02 -#undef COUNT_DIGIT_04 -#undef COUNT_DIGIT_08 -#undef COUNT_DIGIT_16 -#undef COUNT_DIGIT_32 -#undef COUNT_DIGIT_64 - -static void radix_offsets(Index *c) -{ - Index sum=0, t, *ce=c+DIGIT_VALUES; - do t=*c, *c++ = sum, sum+=t; while(c!=ce); -} - -static unsigned radix_zeros(Value bitorkey, Index count[DIGITS][DIGIT_VALUES], - unsigned *shift, Index **offsets) -{ - unsigned digits=0, sh=0; Index *c = &count[0][0]; - do { - if(bitorkey&DIGIT_MASK) *shift++ = sh, *offsets++ = c, ++digits, - radix_offsets(c); - } while(bitorkey>>=DIGIT_BITS,sh+=DIGIT_BITS,c+=DIGIT_VALUES,sh!=VALUE_BITS); - return digits; -} - -static void radix_pass(const Value *A, const Value *end, Index stride, - unsigned sh, Index *off, Value *out) -{ - do out[off[(*A>>sh)&DIGIT_MASK]++] = *A; while(A+=stride,A!=end); -} - -static void radix_sort(const Value *A, Index n, Index stride, - Value *out, Value *work) -{ - Index count[DIGITS][DIGIT_VALUES]; - const Value *end = A+n*stride; - Value bitorkey = radix_count(A, end, stride, count); - unsigned shift[DIGITS]; Index *offsets[DIGITS]; - unsigned digits = radix_zeros(bitorkey,count,shift,offsets); - if(digits==0) { - memset(out,0,sizeof(Value)*n); - } else { - Value *src, *dst; unsigned d; - if((digits&1)==1) src=out,dst=work; - else dst=out,src=work; - radix_pass(A,end,stride,shift[0],offsets[0],src); - for(d=1;d!=digits;++d) { - Value *t; - 
radix_pass(src,src+n,1,shift[d],offsets[d],dst); - t=src,src=dst,dst=t; - } - } -} - -static void radix_index_pass_b(const Value *A, Index n, Index stride, - unsigned sh, Index *off, Data *out) -{ - Index i=0; - do { - Value v = *A; - Data *d = &out[off[(v>>sh)&DIGIT_MASK]++]; - d->v=v, d->i=i++; - } while(A+=stride,i!=n); -} - -static void radix_index_pass_m(const Data *src, const Data *end, - unsigned sh, Index *off, Data *out) -{ - do { - Data *d = &out[off[(src->v>>sh)&DIGIT_MASK]++]; - d->v=src->v,d->i=src->i; - } while(++src!=end); -} - -static void radix_index_pass_e(const Data *src, const Data *end, - unsigned sh, Index *off, - Index *out) -{ - do out[off[(src->v>>sh)&DIGIT_MASK]++]=src->i; while(++src!=end); -} - -static void radix_index_pass_be(const Value *A, Index n, Index stride, - unsigned sh, Index *off, Index *out) -{ - Index i=0; - do out[off[(*A>>sh)&DIGIT_MASK]++]=i++; while(A+=stride,i!=n); -} - -static void radix_index_sort(const Value *A, Index n, Index stride, - Index *idx, Data *work) -{ - Index count[DIGITS][DIGIT_VALUES]; - Value bitorkey = radix_count(A, A+n*stride, stride, count); - unsigned shift[DIGITS]; Index *offsets[DIGITS]; - unsigned digits = radix_zeros(bitorkey,count,shift,offsets); - if(digits==0) { - Index i=0; do *idx++=i++; while(i!=n); - } else if(digits==1) { - radix_index_pass_be(A,n,stride,shift[0],offsets[0],idx); - } else { - Data *src, *dst; unsigned d; - if((digits&1)==0) dst=work,src=dst+n; - else src=work,dst=src+n; - radix_index_pass_b(A,n,stride,shift[0],offsets[0],src); - for(d=1;d!=digits-1;++d) { - Data *t; - radix_index_pass_m(src,src+n,shift[d],offsets[d],dst); - t=src,src=dst,dst=t; - } - radix_index_pass_e(src,src+n,shift[d],offsets[d],idx); - } -} - -/*------------------------------------------------------------------------------ - - Merge Sort - - stable; O(n log n) time - - ----------------------------------------------------------------------------*/ - -static void merge_sort(const Value *A, Index n, 
Index stride, - Value *out, Value *work) -{ - Value *buf[2]={out,work}; - Index base=-n, odd=0, c=0, b=1; - for(;;) { - Value *p; - if((c&1)==0) { - base+=n, n+=(odd&1), c|=1, b^=1; - while(n>3) odd<<=1,odd|=(n&1),n>>=1,c<<=1,b^=1; - } else - base-=n-(odd&1),n<<=1,n-=(odd&1),odd>>=1,c>>=1; - if(c==0) break; - p = buf[b]+base; - if(n==2) { - Value v[2]; v[0]=*A,A+=stride,v[1]=*A,A+=stride; - if(v[1]>1, nb = (n+1)>>1; - const Value *ap = buf[b^1]+base, *ae = ap+na; - Value *bp = p+na, *be = bp+nb; - for(;;) { - if(*bp<*ap) { - *p++=*bp++; - if(bp!=be) continue; - do *p++=*ap++; while(ap!=ae); - break; - } else { - *p++=*ap++; - if(ap==ae) break; - } - } - } - } -} - -static void merge_index_sort(const Value *A, const Index An, Index stride, - Index *idx, Data *work) -{ - Data *buf[2]={work+An,work}; - Index n=An, base=-n, odd=0, c=0, b=1; - Index i=0; - for(;;) { - Data *p; - if((c&1)==0) { - base+=n, n+=(odd&1), c|=1, b^=1; - while(n>3) odd<<=1,odd|=(n&1),n>>=1,c<<=1,b^=1; - } else - base-=n-(odd&1),n<<=1,n-=(odd&1),odd>>=1,c>>=1; - if(c==0) break; - p = buf[b]+base; - if(n==2) { - Value v[2]; v[0]=*A,A+=stride,v[1]=*A,A+=stride; - if(v[1]>1, nb = (n+1)>>1; - const Data *ap = buf[b^1]+base, *ae = ap+na; - Data *bp = p+na, *be = bp+nb; - for(;;) { - if(bp->vv) { - *p++=*bp++; - if(bp!=be) continue; - do *p++=*ap++; while(ap!=ae); - break; - } else { - *p++=*ap++; - if(ap==ae) break; - } - } - } - } - { - const Data *p = buf[0], *pe = p+An; - do *idx++ = (p++)->i; while(p!=pe); - } -} - -/*------------------------------------------------------------------------------ - - Hybrid Stable Sort - - low-overhead merge sort when n is small, - otherwise asymptotically superior radix sort - - result = O(n) sort with good performance for all n - - A, n, stride : specifices the input - - sort: - Value out[n] : the sorted values (output) - Value work[n]: scratch area - - index_sort: - Index idx[n] : the sorted indices (output) - Data work[2*n]: scratch area - - 
----------------------------------------------------------------------------*/ - -void sort(const Value *A, Index n, Index stride, Value *out, Value *work) -{ - if(n -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "sort.h" - -#define sparse_cholesky_factor PREFIXED_NAME(sparse_cholesky_factor) -#define sparse_cholesky_solve PREFIXED_NAME(sparse_cholesky_solve ) -#define sparse_cholesky_free PREFIXED_NAME(sparse_cholesky_free ) - -/* factors: L is in CSR format - D is a diagonal matrix stored as a vector - actual factorization is: - - -1 T - A = (I-L) D (I-L) - - -1 -T -1 - A = (I-L) D (I-L) - - (triangular factor is unit diagonal; the diagonal is not stored) -*/ -struct sparse_cholesky { - uint n, *Lrp, *Lj; - double *L, *D; -}; - -/* - symbolic factorization: finds the sparsity structure of L - - uses the concept of elimination tree: - the parent of node j is node i when L(i,j) is the first - non-zero in column j below the diagonal (i>j) - L's structure is discovered row-by-row; the first time - an entry in column j is set, it must be the parent - - the nonzeros in L are the nonzeros in A + paths up the elimination tree - - linear in the number of nonzeros of L -*/ -static void factor_symbolic(uint n, const uint *Arp, const uint *Aj, - struct sparse_cholesky *out, buffer *buf) -{ - uint *visit = tmalloc(uint,2*n), *parent = visit+n; - uint *Lrp, *Lj; - uint i,nz=0; - - out->n=n; - - for(i=0;i=i) break; - for(;visit[j]!=i;j=parent[j]) { - ++nz, visit[j]=i; - if(parent[j]==n) { parent[j]=i; break; } - } - } - } - - Lrp=out->Lrp=tmalloc(uint,n+1+nz); - Lj =out->Lj =Lrp+n+1; - - Lrp[0]=0; - for(i=0;i=i) break; - for(;visit[j]!=i;j=parent[j]) Ljr[count++]=j, visit[j]=i; - } - sortv(Ljr, Ljr,count,sizeof(uint), buf); - Lrp[i+1]=Lrp[i]+count; - } - free(visit); -} - -/* - numeric factorization: - - L is built row-by-row, using: ( ' indicates transpose ) - - - [ A r ] = [ (I-L) ] [ 
D^(-1) ] [ (I-L)' -s ] - [ r' a ] [ -s' 1 ] [ 1/d ] [ 1 ] - - = [ A (I-L) D^(-1) (-s) ] - [ r' s' D^(-1) s + 1/d ] - - so, if r' is the next row of A, up to but excluding the diagonal, - then the next row of L, s', obeys - - r = - (I-L) D^(-1) s - - let y = (I-L)^(-1) (-r) - then s = D y, and d = 1/(s' y) - -*/ -static void factor_numeric(uint n, const uint *Arp, const uint *Aj, - const double *A, - struct sparse_cholesky *out, - uint *visit, double *y) -{ - const uint *Lrp=out->Lrp, *Lj=out->Lj; - double *D, *L; - uint i; - - D=out->D=tmalloc(double,n+Lrp[n]); - L=out->L=D+n; - - for(i=0;i=i) { if(j==i) a=A[p]; break; } - y[j]=-A[p]; - } - for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) { - uint q,qe,j=Lj[p]; double lij,yj=y[j]; - for(q=Lrp[j],qe=Lrp[j+1];q!=qe;++q) { - uint k=Lj[q]; if(visit[k]==i) yj+=L[q]*y[k]; - } - y[j]=yj; - L[p]=lij=D[j]*yj; - a-=yj*lij; - } - D[i]=1/a; - } -} - -/* x = A^(-1) b; works when x and b alias */ -void sparse_cholesky_solve( - double *x, const struct sparse_cholesky *fac, double *b) -{ - const uint n=fac->n, *Lrp=fac->Lrp, *Lj=fac->Lj; - const double *L=fac->L, *D=fac->D; - uint i, p,pe; - for(i=0;iptr,n_uints_as_dbls+(double*)buf->ptr); -} - -void sparse_cholesky_free(struct sparse_cholesky *fac) -{ - free(fac->Lrp); fac->Lj=fac->Lrp=0; - free(fac->D); fac->L =fac->D =0; -} - diff --git a/3rdParty/gslib.github/src/sparse_cholesky.h b/3rdParty/gslib.github/src/sparse_cholesky.h deleted file mode 100644 index f31cd8a7e..000000000 --- a/3rdParty/gslib.github/src/sparse_cholesky.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef SPARSE_CHOLESKY_H -#define SPARSE_CHOLESKY_H - -#if !defined(TYPES_H) || !defined(MEM_H) -#warning "sparse_cholesky.h" requires "types.h" and "mem.h" -#endif - -#define sparse_cholesky_factor PREFIXED_NAME(sparse_cholesky_factor) -#define sparse_cholesky_solve PREFIXED_NAME(sparse_cholesky_solve ) -#define sparse_cholesky_free PREFIXED_NAME(sparse_cholesky_free ) - -struct sparse_cholesky { - uint n, *Lrp, *Lj; - double *L, *D; 
-}; - -/* input data is the usual CSR - matrix is n by n - Arp has n+1 elements - elements of row i are A [Arp[i]], ..., A [Arp[i+1]-1] - in columns Aj[Arp[i]], ..., Aj[Arp[i+1]-1] -*/ -void sparse_cholesky_factor(uint n, const uint *Arp, const uint *Aj, - const double *A, - struct sparse_cholesky *out, buffer *buf); - -/* x = A^(-1) b; works when x and b alias */ -void sparse_cholesky_solve( - double *x, const struct sparse_cholesky *fac, double *b); - -void sparse_cholesky_free(struct sparse_cholesky *fac); - -#endif - diff --git a/3rdParty/gslib.github/src/sparse_cholesky2.c b/3rdParty/gslib.github/src/sparse_cholesky2.c deleted file mode 100644 index fc63661af..000000000 --- a/3rdParty/gslib.github/src/sparse_cholesky2.c +++ /dev/null @@ -1,172 +0,0 @@ -#include -#include -#include - -#include "types.h" -#include "minmax.h" -#include "errmem.h" -#include "sort.h" - -/* factors: L is in CSR format - D is a diagonal matrix stored as a vector - actual factorization is: - - -1 T - A = (I-L) D (I-L) - - -1 -T -1 - A = (I-L) D (I-L) - - (triangular factor is unit diagonal; the diagonal is not stored) -*/ -typedef struct { - uint n, *Lrp, *Lj; - real *L, *D; -} sparse_cholesky_data; - -/* - symbolic factorization: finds the sparsity structure of L - - uses the concept of elimination tree: - the parent of node j is node i when L(i,j) is the first - non-zero in column j below the diagonal (i>j) - L's structure is discovered row-by-row; the first time - an entry in column j is set, it must be the parent - - the nonzeros in L are the nonzeros in A + paths up the elimination tree - - linear in the number of nonzeros of L -*/ -static void factor_symbolic(uint n, const uint *Arp, const uint *Aj, - sparse_cholesky_data *out, uint *work) -{ - uint *visit, *parent, *sorted; - uint *Lrp, *Lj; - uint i,nz=0; - - out->n=n; - - /* sorted needs 2*n; work needs 4*n */ - visit = work, parent = visit+n, sorted=parent+n; - - for(i=0;i=i) break; - for(;visit[j]!=i;j=parent[j]) { - ++nz, 
visit[j]=i; - if(parent[j]==n) { parent[j]=i; break; } - } - } - } - - Lrp=out->Lrp=tmalloc(uint,n+1+nz); - Lj =out->Lj =Lrp+n+1; - - Lrp[0]=0; - for(i=0;i=i) break; - for(;visit[j]!=i;j=parent[j]) Ljr[count++]=j, visit[j]=i; - } - sort(Ljr,count,1,sorted,sorted+count); - memcpy(Ljr,sorted,count*sizeof(uint)); - Lrp[i+1]=Lrp[i]+count; - } -} - -/* - numeric factorization: - - L is built row-by-row, using: ( ' indicates transpose ) - - - [ A r ] = [ (I-L) ] [ D^(-1) ] [ (I-L)' -s ] - [ r' a ] [ -s' 1 ] [ 1/d ] [ 1 ] - - = [ A (I-L) D^(-1) (-s) ] - [ r' s' D^(-1) s + 1/d ] - - so, if r' is the next row of A, up to but excluding the diagonal, - then the next row of L, s', obeys - - r = - (I-L) D^(-1) s - - let y = (I-L)^(-1) (-r) - then s = D y, and d = 1/(s' y) - -*/ -static void factor_numeric(uint n, const uint *Arp, const uint *Aj, - const real *A, - sparse_cholesky_data *out, - uint *visit, real *y) -{ - const uint *Lrp=out->Lrp, *Lj=out->Lj; - real *D, *L; - uint i; - - D=out->D=tmalloc(real,n+Lrp[n]); - L=out->L=D+n; - - for(i=0;i=i) { if(j==i) a=A[p]; break; } - y[j]=-A[p]; - } - for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) { - uint q,qe,j=Lj[p]; real lij,yj=y[j]; - for(q=Lrp[j],qe=Lrp[j+1];q!=qe;++q) { - uint k=Lj[q]; if(visit[k]==i) yj+=L[q]*y[k]; - } - y[j]=yj; - L[p]=lij=D[j]*yj; - a-=yj*lij; - } - D[i]=1/a; - } -} - -/* x = A^(-1) b; works when x and b alias */ -void sparse_cholesky_solve(real *x, const sparse_cholesky_data *fac, real *b) -{ - const uint n=fac->n, *Lrp=fac->Lrp, *Lj=fac->Lj; - const real *L=fac->L, *D=fac->D; - uint i, p,pe; - for(i=0;iptr); - factor_numeric(n,Arp,Aj,A,out,buf->ptr,n_uints_as_reals+(real*)buf->ptr); -} - -void sparse_cholesky_free(sparse_cholesky_data *fac) -{ - free(fac->Lrp); fac->Lj=fac->Lrp=0; - free(fac->D); fac->L =fac->D =0; -} - diff --git a/3rdParty/gslib.github/src/spchol_test.c b/3rdParty/gslib.github/src/spchol_test.c deleted file mode 100644 index 5fc6596c1..000000000 --- a/3rdParty/gslib.github/src/spchol_test.c +++ 
/dev/null @@ -1,54 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "sparse_cholesky.h" - -int main() -{ -#define x -1 - - uint i,n=7; - uint Aj [] = {0,2, 1,2,6, 0,1,2, 3,5,6, 4,5, 3,4,5, 1,3,6}; - double A[] = {2,x, 2,x,x, x,x,2, 2,x,x, 2,x, x,x,2, x,x,2}; -#undef x - uint Arp[] = {0,2,5,8,11,13,16,19}; - double x[7], b[7] = {0,0,0,0, 0,0,0}; - uint o[7] = {0,2,1,6,3,5,4}; -/* - uint i,n=10; - uint Aj [] = {0,2,7, 1,4,9, 0,2,6, 3,8,9, 1,4,8,9, 5,6,7, 2,5,6, 0,5,7,8,9, 3,4,7,8, 1,3,4,7,9}; - real A [] = {3,x,x, 2,x,x, x,2,x, 2,x,x, x,3,x,x, 2,x,x, x,x,2, x,x,4,x,x, x,x,x,3, x,x,x,x,4}; -#undef x - uint Arp[] = {0, 3, 6, 9, 12, 16, 19, 22, 27, 31, 36}; - real b[] = {1,2,3,4,5, 6,7,8,9,10}; -*/ - struct sparse_cholesky data; - buffer buf; - buffer_init(&buf,4); - sparse_cholesky_factor(n,Arp,Aj,A,&data,&buf); - - for(i=0;i=1 */ -static real inner(const real *u, const real *v, unsigned n) -{ - const real *u_end = u+n; - real sum = *u++ * *v++; - while(u!=u_end) { sum += *u++ * *v++; } - return sum; -} - -/*-------------------------------------------------------------------------- - 1-,2-,3-d Tensor Application - - the 3d case: - tensor_f3(R,mr,nr, S,ms,ns, T,mt,nt, u,v, work1,work2) - gives v = [ R (x) S (x) T ] u - where R is mr x nr, S is ms x ns, T is mt x nt, - each in row- or column-major format according to f := r | c - u is nr x ns x nt in column-major format (inner index is r) - v is mr x ms x mt in column-major format (inner index is r) - --------------------------------------------------------------------------*/ - -void tensor_c1(const real *R, unsigned mr, unsigned nr, - const real *u, real *v) -{ - mxv_c(v,mr,R,u,nr); -} - -void tensor_r1(const real *R, unsigned mr, unsigned nr, - const real *u, real *v) -{ - mxv_r(v,mr,R,u,nr); -} - -/* W holds mr*ns reals */ -void tensor_c2(const real *R, unsigned mr, unsigned nr, - const real *S, unsigned ms, 
unsigned ns, - const real *u, real *v, real *W) -{ - mxm_cc(R,mr,u,nr,W,ns); - mxm_cr(W,mr,S,ns,v,ms); -} - -/* W holds mr*ns reals */ -void tensor_r2(const real *R, unsigned mr, unsigned nr, - const real *S, unsigned ms, unsigned ns, - const real *u, real *v, real *W) -{ - mxm_rc(R,mr,u,nr,W,ns); - mxm_cc(W,mr,S,ns,v,ms); -} - -/* W holds mr*ns*nt reals, - Z holds mr*ms*nt reals */ -void tensor_c3(const real *R, unsigned mr, unsigned nr, - const real *S, unsigned ms, unsigned ns, - const real *T, unsigned mt, unsigned nt, - const real *u, real *v, real *W, real *Z) -{ - unsigned n,mrns=mr*ns,mrms=mr*ms; - real *Zp = Z; - mxm_cc(R,mr,u,nr,W,ns*nt); - for(n=0;n -#include -#include -#include -#include -#include -#include "errmem.h" -#include "types.h" -#include "minmax.h" -#include "sort.h" -#include "tuple_list.h" -#include "crystal.h" - -#define UINT_PER_X(X) ((sizeof(X)+sizeof(uint)-1)/sizeof(uint)) -#define UINT_PER_REAL UINT_PER_X(real) -#define UINT_PER_LONG UINT_PER_X(slong) - -/*------------------------------------------------------------------------------ - - Transfer - - Treats one integer (not long) member of the tuple list as a target proc; - Sends out tuples accordingly, using the crystal router. - Target proc member overwritten with source proc. - - dynamic: non-zero if the tuple list should grow to accomodate arrivals - tl: the tuple list - pf: which tuple member specifies target proc - crystal: an initialized crystal router structure (cf. 
crystal.h) - - ----------------------------------------------------------------------------*/ - -void transfer(int dynamic, tuple_list *tl, - unsigned pf, crystal_data *crystal) -{ - const unsigned mi=tl->mi,ml=tl->ml,mr=tl->mr; - const unsigned tsize = (mi-1) + ml*UINT_PER_LONG + mr*UINT_PER_REAL; - sint p, lp = -1; - sint *ri; slong *rl; real *rr; - uint i, j, *buf, *len=0, *buf_end; - - /* sort to group by target proc */ - tuple_list_sort(tl,pf,&crystal->all->buf); - - /* pack into buffer for crystal router */ - buffer_reserve(&crystal->all->buf,(tl->n*(3+tsize))*sizeof(uint)); - crystal->all->n=0, buf = crystal->all->buf.ptr; - ri=tl->vi,rl=tl->vl,rr=tl->vr; - for(i=tl->n;i;--i) { - p = ri[pf]; - if(p!=lp) { - lp = p; - *buf++ = p; /* target */ - *buf++ = crystal->id; /* source */ - len = buf++; *len=0; /* length */ - crystal->all->n += 3; - } - for(j=0;jall->n += tsize; - } - - crystal_router(crystal); - - /* unpack */ - buf = crystal->all->buf.ptr, buf_end = buf + crystal->all->n; - tl->n = 0; - ri=tl->vi,rl=tl->vl,rr=tl->vr; - while(buf != buf_end) { - sint p, len; - buf++; /* target ( == this proc ) */ - p = *buf++; /* source */ - len = *buf++; /* length */ - while(len>0) { - if(tl->n==tl->max) { - if(!dynamic) { tl->n = tl->max + 1; return; } - tuple_list_grow(tl); - ri = tl->vi + mi*tl->n, rl = tl->vl + ml*tl->n, rr = tl->vr + mr*tl->n; - } - ++tl->n; - for(j=0;j -#include -#include -#include -#include "errmem.h" -#include "types.h" -#include "crystal.h" -#include "tuple_list.h" -#include "transfer.h" - -#define MI 3 -#define ML 2 -#define MR 3 -#define PF 0 - -int main(int narg, char *arg[]) -{ - tuple_list tl; - int id=0,np=1; uint i; - crystal_data crystal; - MPI_Comm comm; - MPI_Init(&narg,&arg); - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - MPI_Comm_rank(comm,&id); - MPI_Comm_size(comm,&np); - - srand(id+1); - tuple_list_init_max(&tl,MI,ML,MR,np*3); - tl.n = np*3; - for(i=0;itl.max) tl.n=tl.max, printf("%d lost some\n", id); - - for(i=0;i -int main() -{ - 
printf("Not compiled with -DMPI. Test is meaningless.\n"); - return 0; -} - -#endif - diff --git a/3rdParty/gslib.github/src/tuple_list.c b/3rdParty/gslib.github/src/tuple_list.c deleted file mode 100644 index f18e829a9..000000000 --- a/3rdParty/gslib.github/src/tuple_list.c +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include -#include -#include "errmem.h" -#include "types.h" -#include "minmax.h" -#include "sort.h" - -typedef struct { - unsigned mi,ml,mr; - uint n, max; - sint *vi; slong *vl; real *vr; -} tuple_list; - -void tuple_list_permute(tuple_list *tl, uint *perm, void *work) -{ - const unsigned mi=tl->mi, ml=tl->ml, mr=tl->mr; - const unsigned int_size = mi*sizeof(sint), - long_size = ml*sizeof(slong), - real_size = mr*sizeof(real); - if(mi) { - uint *p=perm, *pe=p+tl->n; char *sorted=work; - while(p!=pe) memcpy(sorted,&tl->vi[mi*(*p++)],int_size),sorted+=int_size; - memcpy(tl->vi,work,int_size*tl->n); - } - if(ml) { - uint *p=perm, *pe=p+tl->n; char *sorted=work; - while(p!=pe) memcpy(sorted,&tl->vl[ml*(*p++)],long_size),sorted+=long_size; - memcpy(tl->vl,work,long_size*tl->n); - } - if(mr) { - uint *p=perm, *pe=p+tl->n; char *sorted=work; - while(p!=pe) memcpy(sorted,&tl->vr[mr*(*p++)],real_size),sorted+=real_size; - memcpy(tl->vr,work,real_size*tl->n); - } -} - -void tuple_list_sort(tuple_list *tl, unsigned key, buffer *buf) -{ - const unsigned mi=tl->mi, ml=tl->ml, mr=tl->mr; - const unsigned int_size = mi*sizeof(sint); - const unsigned long_size = ml*sizeof(slong); - const unsigned real_size = mr*sizeof(real); - const unsigned width = umax_3(int_size,long_size,real_size); - const unsigned data_size = key>=mi ? 
sizeof(sort_data_long):sizeof(sort_data); - uint work_min=tl->n * umax_2(2*data_size,sizeof(sint)+width); - uint *work; - buffer_reserve(buf,work_min); - work = buf->ptr; - if(keyvi[key ],tl->n,mi, work, (void*)work); - else - index_sort_long((ulong*)&tl->vl[key-mi],tl->n,ml, work, (void*)work); - tuple_list_permute(tl,work,work+tl->n); -} - diff --git a/3rdParty/gslib.github/src/tuple_list.h b/3rdParty/gslib.github/src/tuple_list.h deleted file mode 100644 index 04aab5ec8..000000000 --- a/3rdParty/gslib.github/src/tuple_list.h +++ /dev/null @@ -1,84 +0,0 @@ -/*------------------------------------------------------------------------------ - - Tuple list definition and utilities - - Conceptually, a tuple list is a list of n records or tuples, - each with mi integers, ml longs, and mr reals - (these types are defined in "types.h" as sint, slong, real; - it may be that sint==slong) - - There are three arrays, one for each type (vi,vl,vr), - with records layed out contiguously within each array - - ----------------------------------------------------------------------------*/ - -#ifndef TUPLE_LIST_H -#define TUPLE_LIST_H - -/* requires "errmem.h" and "types.h" */ -#if !defined(ERRMEM_H) || !defined(TYPES_H) -#warning "tuple_list.h" requires "errmem.h" and "types.h" -#endif - -typedef struct { - unsigned mi,ml,mr; - uint n, max; - sint *vi; slong *vl; real *vr; -} tuple_list; - -/* storage layed out as: vi[max][mi], vl[max][ml], vr[max][mr] - where "tuple" i is given by (vi[i][0:mi-1],vl[i][0:ml-1],vr[i][0:mr-1]). 
- only the first n tuples are in use */ - -static void tuple_list_init(tuple_list *tl, - unsigned mi, unsigned ml, unsigned mr) -{ - tl->n=tl->max=0; - tl->mi=mi,tl->ml=ml,tl->mr=mr; - tl->vi=0,tl->vl=0,tl->vr=0; -} - -static void tuple_list_init_max(tuple_list *tl, - unsigned mi, unsigned ml, unsigned mr, uint max) -{ - tl->n=0; tl->max=max; - tl->mi=mi,tl->ml=ml,tl->mr=mr; - tl->vi=tmalloc(sint, max*mi); - tl->vl=tmalloc(slong,max*ml); - tl->vr=tmalloc(real, max*mr); -} - -static void tuple_list_free(tuple_list *tl) { - free(tl->vi), free(tl->vl), free(tl->vr); -} - -static void tuple_list_resize(tuple_list *tl, uint max) -{ - tl->max = max; - tl->vi=trealloc(sint, tl->vi,tl->max*tl->mi); - tl->vl=trealloc(slong,tl->vl,tl->max*tl->ml); - tl->vr=trealloc(real, tl->vr,tl->max*tl->mr); -} - -static void tuple_list_reserve(tuple_list *tl, uint min) -{ - uint max = tl->max; - if(maxmax+tl->max/2+1); -} - -void tuple_list_permute(tuple_list *tl, uint *perm, void *work); -/* sort tuples by the field specified by key -#include -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "tensor.h" -#include "gs_defs.h" -#include "comm.h" -#include "mem.h" -#include "sort.h" -#include "sarray_sort.h" -#include "sparse_cholesky.h" -#include "gs.h" - -#define crs_setup PREFIXED_NAME(crs_setup) -#define crs_solve PREFIXED_NAME(crs_solve) -#define crs_stats PREFIXED_NAME(crs_stats) -#define crs_free PREFIXED_NAME(crs_free ) - -/* - portable log base 2 - - does a binary search to find leading order bit - - UINT_BITS = number of bits in a uint - BITS(0) = UINT_BITS - BITS(i) = half of BITS(i-1), rounded up - MASK(i) = bitmask with BITS(i) 1's followed by BITS(i) 0's -*/ - -static unsigned lg(uint v) -{ - unsigned r = 0; -#define UINT_BITS (sizeof(uint)*CHAR_BIT) -#define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i)) -#define MASK(i) ((((uint)1<>=BITS(i), r+=BITS(i) - CHECK(1); CHECK(2); CHECK(3); CHECK(4); 
CHECK(5); CHECK(6); CHECK(7); - CHECK(8); CHECK(9); /* this covers up to 1024-bit uints */ - if(v&2) ++r; - return r; -#undef UINT_BITS -#undef BITS -#undef MASK -#undef CHECK -} - -struct csr_mat { - uint n, *Arp, *Aj; double *A; -}; - -struct xxt { - - /* communication */ - - struct comm comm; - uint pcoord; /* coordinate in communication tree */ - unsigned plevels; /* # of stages of communication */ - sint *pother; /* let p = pother[i], then during stage i of fan-in, - if p>=0, receive from p - if p< 0, send to (-p-1) - fan-out is just the reverse ... - on proc 0, pother is never negative - on others, pother is negative for the last stage only */ - comm_req *req; - - /* separators */ - - unsigned nsep; /* number of separators */ - uint *sep_size; /* # of dofs on each separator, - ordered from the bottom to the top of the tree: - separator 0 is the bottom-most one (dofs not shared) - separator nsep-1 is the root of the tree */ - - unsigned null_space; - double *share_weight; - - /* vector sizes */ - - uint un; /* user's vector size */ - - /* xxt_solve works with "condensed" vectors; - same dofs as in user's vectors, but no duplicates and no Dirichlet nodes, - and also ordered topologically (children before parents) according to the - separator tree */ - - uint cn; /* size of condensed vectors */ - sint *perm_u2c; /* permutation from user vector to condensed vector, - p=perm_u2c[i]; xu[i] = p=-1 ? 0 : xc[p]; */ - uint ln, sn; /* xc[0 ... ln-1] are not shared (ln=sep_size[0]) - xc[ln ... 
ln+sn-1] are shared - ln+sn = cn */ - - uint xn; /* # of columns of x = sum_i(sep_size[i]) - sep_size[0] */ - - /* data */ - struct sparse_cholesky fac_A_ll; - struct csr_mat A_sl; - uint *Xp; double *X; /* column i of X starts at X[Xp[i]] */ - - /* execution buffers */ - double *vl, *vc, *vx, *combuf; -}; - - -/* - for the binary communication tree, the procs are divided in half - at each level, with the second half always the larger - - e.g., for np = 13: - - +------13-------+ - | | - +---6---+ +---7---+ - | | | | - +-3-+ +-3-+ +-3-+ +-4-+ - 1 2 1 2 1 2 2 2 - 1^1 1^1 1^1 1^1 1^1 - - plevels is the number of levels in the tree - = np==1 ? 1 : ( lg(np-1)+2 ) - - labelling the nodes with proc id's gives this communication tree: - - +-------0-------+ - | | - +---0---+ +---6---+ - | | | | - +-0-+ +-3-+ +-6-+ +-9-+ - 0 1 3 4 6 7 9 b - 1^2 4^5 7^8 9^a b^c - - consider proc 7 (pid = 7); - pcoord gives the position of the leaf labelled 7: - Root Right Left Right Left -> RRLRL -> 11010 - so pcoord = 11010 binary - note the parent coordinate can be found by bit shifting right - (i.e. 
dividing by 2) -*/ - -/* sets: pcoord, nsep, plevels, pother, req */ -static void locate_proc(struct xxt *data) -{ - const uint id = data->comm.id; - uint n = data->comm.np, c=1, odd=0, base=0; - unsigned level=0; - while(n>1) { - ++level; - odd=(odd<<1)|(n&1); - c<<=1, n>>=1; - if(id>=base+n) c|=1, base+=n, n+=(odd&1); - } - data->pcoord=c; - data->nsep = level+1; - data->plevels = data->nsep-1; - data->pother = tmalloc(sint,data->plevels); - data->req = tmalloc(comm_req,data->plevels); - for(level=0;levelplevels;++level) { - if((c&1)==1) { - uint targ = id - (n-(odd&1)); - data->pother[level]=-(sint)(targ+1); - data->plevels = level+1; - break; - } else { - data->pother[level]=id+n; - c>>=1, n=(n<<1)+(odd&1), odd>>=1; - } - } -} - -/* the tuple list describing the condensed dofs: - [(separator level, share count, global id)] */ -struct dof { ulong id; uint level, count; }; - -/* determine the size of each separator; - sums the separator sizes following the fan-in, fan-out comm. pattern - uses the share-counts to avoid counting dofs more than once */ -/* sets: xn, sep_size, ln, sn */ -static void discover_sep_sizes(struct xxt *data, - struct array *dofa, buffer *buf) -{ - const unsigned ns=data->nsep, nl=data->plevels; - const uint n = dofa->n; - float *v, *recv; - unsigned i,lvl; uint j; - const struct dof *dof = dofa->ptr; - - buffer_reserve(buf,2*ns*sizeof(float)); - v=buf->ptr, recv=v+ns; - - for(i=0;ipother[lvl]; - unsigned s = ns-(lvl+1); - if(other<0) { - comm_send(&data->comm,v +lvl+1,s*sizeof(float),-other-1,s); - } else { - comm_recv(&data->comm,recv+lvl+1,s*sizeof(float),other,s); - for(i=lvl+1;ipother[--lvl]; - unsigned s = ns-(lvl+1); - if(other<0) - comm_recv(&data->comm,v+lvl+1,s*sizeof(float),-other-1,s); - else - comm_send(&data->comm,v+lvl+1,s*sizeof(float),other,s); - } - - data->xn=0; - data->sep_size = tmalloc(uint,ns); - for(i=0;isep_size[i]=s; - data->xn+=s; - } - data->ln=data->sep_size[0]; - data->sn=data->cn-data->ln; - 
data->xn-=data->ln; -} - -/* assuming [A,Aend) is sorted, - removes 0's and any duplicate entries, - returns new end */ -static ulong *unique_nonzero(ulong *A, ulong *Aend) -{ - if(Aend==A) return A; - else { - ulong *end = Aend-1, last=*end, *p=A,*q=A,v=0; - *end = 1; - while(*q==0) ++q; /* *q==0 => q!=end since *end==0 */ - *end = 0; - while(q!=end) { - v=*q++, *p++=v; - while(*q==v) ++q; /* *q==v => q!=end since *end==0 */ - } - if(last!=v) *p++=last; - return p; - } -} - -static void merge_sep_ids(struct xxt *data, ulong *sep_id, ulong *other, - ulong *work, unsigned s0, buffer *buf) -{ - const unsigned ns = data->nsep; - unsigned s; - ulong *p=sep_id, *q=other; - for(s=s0;ssep_size[s]; - memcpy(work ,p,size*sizeof(ulong)); - memcpy(work+size,q,size*sizeof(ulong)); - sortv_long(work, work,2*size,sizeof(ulong), buf); - end = unique_nonzero(work,work+2*size); - memcpy(p,work,(end-work)*sizeof(ulong)); - p+=size, q+=size; - } -} - -static void init_sep_ids(struct xxt *data, struct array *dofa, ulong *xid) -{ - const unsigned ns=data->nsep; - const uint n=data->cn, *sep_size=data->sep_size; - unsigned s=1; - uint i, size; - const struct dof *dof = dofa->ptr; - if(ns==1) return; - size=sep_size[s]; - for(i=data->ln;isep_size[s]; - } - *xid++ = dof[i].id, --size; - } - while(s!=ns) { - memset(xid,0,size*sizeof(ulong)); - xid+=size; - if(++s != ns) size=data->sep_size[s]; - } -} - -static void find_perm_x2c(uint ln, uint cn, const struct array *dofc, - uint xn, const ulong *xid, sint *perm) -{ - const struct dof *dof = dofc->ptr, *dof_end = dof+cn; - const ulong *xid_end = xid+xn; uint i=ln; - dof+=ln; - while(dof!=dof_end) { - ulong v=dof->id; - while(*xid!=v) ++xid, *perm++ = -1; - *perm++ = i++, ++dof, ++xid; - } - while(xid!=xid_end) ++xid, *perm++ = -1; -} - -/* sets: perm_x2c */ -static sint *discover_sep_ids(struct xxt *data, struct array *dofa, buffer *buf) -{ - const unsigned ns=data->nsep, nl=data->plevels; - const uint xn=data->xn, *sep_size=data->sep_size; 
- ulong *xid, *recv, *work, *p; - unsigned lvl; - uint size,ss; - sint *perm_x2c; - - size=0; for(lvl=1;lvlsize) size=sep_size[lvl]; - xid=tmalloc(ulong,2*xn+2*size), recv=xid+xn, work=recv+xn; - - init_sep_ids(data,dofa,xid); - - if(nl) { - /* fan-in */ - p=xid, size=xn; - for(lvl=0;lvlpother[lvl]; - if(other<0) { - comm_send(&data->comm,p ,size*sizeof(ulong),-other-1,size); - } else { - comm_recv(&data->comm,recv,size*sizeof(ulong),other,size); - merge_sep_ids(data,p,recv,work,lvl+1,buf); - } - ss=data->sep_size[lvl+1]; - if(ss>=size || lvl==nl-1) break; - p+=ss, size-=ss; - } - /* fan-out */ - for(;;) { - sint other = data->pother[lvl]; - if(other<0) - comm_recv(&data->comm,p,size*sizeof(ulong),-other-1,size); - else - comm_send(&data->comm,p,size*sizeof(ulong),other,size); - if(lvl==0) break; - ss=data->sep_size[lvl]; - p-=ss, size+=ss, --lvl; - } - } - - perm_x2c=tmalloc(sint,xn); - find_perm_x2c(data->ln,data->cn,dofa, xn,xid, perm_x2c); - free(xid); - - return perm_x2c; -} - -static void apply_QQt(struct xxt *data, double *v, uint n, uint tag) -{ - const unsigned nl=data->plevels; - double *p=v, *recv=data->combuf; - unsigned lvl, nsend=0; - uint size=n, ss; - - if(n==0 || nl==0) return; - - tag=tag*2+0; - /* fan-in */ - for(lvl=0;lvlpother[lvl]; - if(other<0) { - comm_send(&data->comm,p ,size*sizeof(double),-other-1,tag); - } else { - uint i; - comm_recv(&data->comm,recv,size*sizeof(double),other ,tag); - for(i=0;isep_size[lvl+1]; - if(ss>=size || lvl==nl-1) break; - p+=ss, size-=ss; - } - /* fan-out */ - for(;;) { - sint other = data->pother[lvl]; - if(other<0) { - comm_recv (&data->comm,p,size*sizeof(double),-other-1,tag); - } else { - comm_isend(&data->req[nsend++],&data->comm, - p,size*sizeof(double),other ,tag); - } - if(lvl==0) break; - ss=data->sep_size[lvl]; - p-=ss, size+=ss, --lvl; - } - if(nsend) comm_wait(data->req,nsend); -} - -static double sum(struct xxt *data, double v, uint n, uint tag) -{ - const unsigned nl=data->plevels; - double r; - 
unsigned lvl,nsend=0; - uint size=n, ss; - - tag=tag*2+1; - if(n==0 || nl==0) return v; - /* fan-in */ - for(lvl=0;lvlpother[lvl]; - if(other<0) { - comm_send(&data->comm,&v,sizeof(double),-other-1,tag); - } else { - comm_recv(&data->comm,&r,sizeof(double),other ,tag); - v+=r; - } - ss=data->sep_size[lvl+1]; - if(ss>=size || lvl==nl-1) break; - size-=ss; - } - /* fan-out */ - for(;;) { - sint other = data->pother[lvl]; - if(other<0) { - comm_recv (&data->comm,&v,sizeof(double),-other-1,tag); - } else { - comm_isend(&data->req[nsend++],&data->comm, - &v,sizeof(double),other ,tag); - } - if(lvl==0) break; - ss=data->sep_size[lvl]; - size+=ss, --lvl; - } - if(nsend) comm_wait(data->req,nsend); - return v; -} - -/* sorts an array of ids, removes 0's and duplicates; - just returns the permutation */ -static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf) -{ - uint *p, i, un=0; ulong last=0; - p = sortp_long(buf,0, id,n,sizeof(ulong)); - for(i=0;in=0; - return un; -} - -/* given user's list of dofs (as id's) - uses gather-scatter to find share-count and separator # for each - outputs as a list, sorted topologically (children before parents) - according to the sep. 
tree (and without duplicates), - as well as the permutation to get there from the user's list */ -/* sets: un, cn, perm_u2c */ -static void discover_dofs( - struct xxt *data, uint n, const ulong *id, - struct array *dofa, buffer *buf, const struct comm *comm) -{ - const uint pcoord = data->pcoord, ns=data->nsep; - sint *perm; - uint i, cn, *p, *pi; - ulong *bid; - struct gs_data *gsh; sint *v; - struct dof *dof; - - data->un = n; - data->perm_u2c = perm = tmalloc(sint,n); - data->cn = cn = unique_ids(n,id,perm,buf); - array_init(struct dof,dofa,cn), dofa->n=cn, dof=dofa->ptr; - buffer_reserve(buf,cn*sizeof(ulong)), bid=buf->ptr; - for(i=0;i=0) bid[perm[i]]=dof[perm[i]].id=id[i]; - - gsh = gs_setup((const slong*)bid,cn,comm,0,gs_crystal_router,0); - v = tmalloc(sint,cn); - - for(i=0;i=0) perm[i]=pi[perm[i]]; - sarray_permute_buf(struct dof,dof,cn, buf); -} - -/* vl += A_ls * vs */ -static void apply_p_Als(double *vl, struct xxt *data, const double *vs, uint ns) -{ - const uint *Arp = data->A_sl.Arp, - *Aj = data->A_sl.Aj; - const double *A = data->A_sl.A; - uint i,p,pe; - for(i=0;iA_sl.Arp, - *Aj = data->A_sl.Aj; - const double *A = data->A_sl.A; - uint i,p,pe; - for(i=0;iln; - const uint *Asl_rp = data->A_sl.Arp, *Ass_rp = A_ss->Arp, - *Asl_j = data->A_sl.Aj, *Ass_j = A_ss->Aj; - const double *Asl = data->A_sl.A, *Ass = A_ss->A; - uint i,p,pe; - for(i=0;i=ei) break; - vs[j]=-Ass[p]; - } - for(i=0;ifac_A_ll,vl); - apply_m_Asl(vs,ei,data,vl); -} - -static void apply_S(double *Svs, uint ns, struct xxt *data, - struct csr_mat *A_ss, const double *vs, double *vl) -{ - const uint ln=data->ln; - const uint *Ass_rp = A_ss->Arp, - *Ass_j = A_ss->Aj; - const double *Ass = A_ss->A; - uint i, p,pe; - for(i=0;i=ns) break; - sum+=Ass[p]*vs[j]; - } - Svs[i]=sum; - } - for(i=0;ifac_A_ll,vl); - apply_m_Asl(Svs,ns,data,vl); -} - -/* vx = X' * vs */ -static void apply_Xt(double *vx, uint nx, const struct xxt *data, - const double *vs) -{ - const double *X = data->X; const uint *Xp = 
data->Xp; - uint i; for(i=0;iX; const uint *Xp = data->Xp; - uint i,j; - for(i=0;ixn; - uint i,h=0; - if(data->null_space && xn) --xn; - data->Xp = tmalloc(uint,xn+1); - data->Xp[0]=0; - for(i=0;iXp[i+1]=data->Xp[i]+h; - } - data->X = tmalloc(double,data->Xp[xn]); -} - -static void orthogonalize(struct xxt *data, struct csr_mat *A_ss, - sint *perm_x2c, buffer *buf) -{ - uint ln=data->ln, sn=data->sn, xn=data->xn; - double *vl, *vs, *vx, *Svs; - uint i,j; - - allocate_X(data,perm_x2c); - - buffer_reserve(buf,(ln+2*sn+xn)*sizeof(double)); - vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn; - - if(data->null_space && xn) --xn; - for(i=0;iXp[i+1]-data->Xp[i]; - sint ui = perm_x2c[i]; - double ytsy, *x; - - if(ui == -1) { - for(j=0;jX[data->Xp[i]]; - for(j=0;jn; - struct yale_mat *p, *q; - sarray_sort_2(struct yale_mat,mat->ptr,mat->n, i,0, j,0, buf); - - p = mat->ptr; - for(k=0;k+1i,j=p->j; - q = p+1; - for(;ki&&j==q->j) p->v += q->v, --mat->n; - else ++p, p->i=i=q->i,p->j=j=q->j, p->v=q->v; - } - } - - nz=mat->n; - out->n=nr; - out->Arp = tmalloc(uint,nr+1+mat->n); - out->Aj = out->Arp+nr+1; - out->A = tmalloc(double,mat->n); - for(k=0;kArp[k]=0; - for(p=mat->ptr,k=0;kArp[p->i]++, out->Aj[k]=p->j, out->A[k]=p->v; - nz=0; for(k=0;k<=nr;++k) { uint t=out->Arp[k]; out->Arp[k]=nz, nz+=t; } -} - -static void separate_matrix( - uint nz, const uint *Ai, const uint *Aj, const double *A, - const sint *perm, uint ln, uint sn, - struct csr_mat *out_ll, struct csr_mat *out_sl, struct csr_mat *out_ss, - buffer *buf -) -{ - uint k,n; - struct array mat_ll, mat_sl, mat_ss; - struct yale_mat *mll, *msl, *mss; - array_init(struct yale_mat,&mat_ll,2*nz), mll=mat_ll.ptr; - array_init(struct yale_mat,&mat_sl,2*nz), msl=mat_sl.ptr; - array_init(struct yale_mat,&mat_ss,2*nz), mss=mat_ss.ptr; - for(k=0;kcomm,comm); - - locate_proc(data); - - data->null_space=null_space; - - buffer_init(&buf,1024); - - discover_dofs(data,n,id,&dofa,&buf,&data->comm); - discover_sep_sizes(data,&dofa,&buf); - - 
perm_x2c = discover_sep_ids(data,&dofa,&buf); - if(data->null_space) { - uint i; double count = 0; struct dof *dof = dofa.ptr; - for(i=0;icn;++i) count+=1/(double)dof[i].count; - count=1/sum(data,count,data->xn,0); - data->share_weight=tmalloc(double,data->cn); - for(i=0;icn;++i) - data->share_weight[i]=count/dof[i].count; - } - array_free(&dofa); - - if(!data->null_space || data->xn!=0) { - separate_matrix(nz,Ai,Aj,A,data->perm_u2c, - data->ln,data->sn, - &A_ll,&data->A_sl,&A_ss, - &buf); - } else { - separate_matrix(nz,Ai,Aj,A,data->perm_u2c, - data->ln-1,1, - &A_ll,&data->A_sl,&A_ss, - &buf); - } - - sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A, - &data->fac_A_ll, &buf); - free(A_ll.Arp); free(A_ll.A); - - data->vl = tmalloc(double,data->ln+data->cn+2*data->xn); - data->vc = data->vl+data->ln; - data->vx = data->vc+data->cn; - data->combuf = data->vx+data->xn; - - orthogonalize(data,&A_ss,perm_x2c,&buf); - free(A_ss.Arp); free(A_ss.A); - free(perm_x2c); - buffer_free(&buf); - - return data; -} - -void crs_solve(double *x, struct xxt *data, const double *b) -{ - uint cn=data->cn, un=data->un, ln=data->ln, sn=data->sn, xn=data->xn; - double *vl=data->vl, *vc=data->vc, *vx=data->vx; - uint i; - for(i=0;iperm_u2c[i]; - if(p>=0) vc[p]+=b[i]; - } - if(xn>0 && (!data->null_space || xn>1)) { - if(data->null_space) --xn; - sparse_cholesky_solve(vc,&data->fac_A_ll,vc); - apply_m_Asl(vc+ln,sn, data, vc); - apply_Xt(vx,xn, data, vc+ln); - apply_QQt(data,vx,xn,0); - apply_X(vc+ln,sn, data, vx,xn); - for(i=0;ifac_A_ll,vl); - for(i=0;ifac_A_ll,vc); - if(data->null_space) { - if(xn==0) vc[ln-1]=0; - else if(sn==1) vc[ln]=0; - } - } - if(data->null_space) { - double s=0; - for(i=0;ishare_weight[i]*vc[i]; - s = sum(data,s,data->xn,0); - for(i=0;iperm_u2c[i]; - x[i] = p>=0 ? 
vc[p] : 0; - } -} - -void crs_stats(struct xxt *data) -{ - int a,b; uint xcol; - if(data->comm.id==0) { - unsigned s; - printf("xxt: separator sizes on %d =",(int)data->comm.id); - for(s=0;snsep;++s) printf(" %d",(int)data->sep_size[s]); - printf("\n"); - printf("xxt: shared dofs on %d = %d\n",(int)data->comm.id,(int)data->sn); - } - a=data->ln; - comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b); - if(data->comm.id==0) printf("xxt: max non-shared dofs = %d\n",a); - a=data->sn; - comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b); - if(data->comm.id==0) printf("xxt: max shared dofs = %d\n",a); - xcol=data->xn; if(xcol&&data->null_space) --xcol; - a=xcol; - comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b); - if(data->comm.id==0) printf("xxt: max X cols = %d\n",a); - a=data->Xp[xcol]*sizeof(double); - comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b); - if(data->comm.id==0) printf("xxt: max X size = %d bytes\n",a); -} - -void crs_free(struct xxt *data) -{ - comm_free(&data->comm); - free(data->pother); - free(data->req); - free(data->sep_size); - free(data->perm_u2c); - if(data->null_space) free(data->share_weight); - sparse_cholesky_free(&data->fac_A_ll); - free(data->A_sl.Arp); free(data->A_sl.A); - free(data->Xp); free(data->X); - free(data->vl); - free(data); -} - diff --git a/3rdParty/gslib.github/src/xxt.h b/3rdParty/gslib.github/src/xxt.h deleted file mode 100644 index 915854553..000000000 --- a/3rdParty/gslib.github/src/xxt.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef XXT_H -#define XXT_H - -/* requires "types.h", and, when MPI is defined, "crystal.h" */ -#if !defined(TYPES_H) || ( defined(MPI) && !defined(CRYSTAL_H) ) -#warning "xxt.h" requires "types.h" and "crystal.h" -#endif - -typedef struct xxt_data_ xxt_data; - -#ifndef MPI -# define crystal_data void -#endif - -#define xxt_free xxt_jl_free -#define xxt_solve xxt_jl_solve -#define xxt_stats xxt_jl_stats - -xxt_data *xxt_setup(uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, 
const real *A, - uint null_space, crystal_data *crystal); -void xxt_solve(real *x, xxt_data *data, const real *b); -void xxt_stats(xxt_data *data); -void xxt_free(xxt_data *data); - -#ifndef MPI -# undef crystal_data -#endif - -#endif - diff --git a/3rdParty/gslib.github/src/xxt2.c b/3rdParty/gslib.github/src/xxt2.c deleted file mode 100644 index cd4413385..000000000 --- a/3rdParty/gslib.github/src/xxt2.c +++ /dev/null @@ -1,1097 +0,0 @@ -#include -#include -#include -#include -#ifdef MPI -# include -# include -# include -#endif - -#include "errmem.h" -#include "types.h" -#include "sort.h" -#include "sparse_cholesky.h" -#ifdef MPI -# include "minmax.h" -# include "crystal.h" -# include "gs.h" -#endif -#include "tuple_list.h" - -#define xxt_free xxt_jl_free -#define xxt_solve xxt_jl_solve -#define xxt_stats xxt_jl_stats - -/* - portable log base 2 - - does a binary search to find leading order bit - - UINT_BITS = number of bits in a uint - BITS(0) = UINT_BITS - BITS(i) = half of BITS(i-1), rounded up - MASK(i) = bitmask with BITS(i) 1's followed by BITS(i) 0's -*/ - -static unsigned lg(uint v) -{ - unsigned r = 0; -#define UINT_BITS (sizeof(uint)*CHAR_BIT) -#define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i)) -#define MASK(i) ((((uint)1<>=BITS(i), r+=BITS(i) - CHECK(1); CHECK(2); CHECK(3); CHECK(4); CHECK(5); CHECK(6); CHECK(7); - CHECK(8); CHECK(9); /* this covers up to 1024-bit uints */ - if(v&2) ++r; - return r; -#undef UINT_BITS -#undef BITS -#undef MASK -#undef CHECK -} - -typedef struct { - uint n, *Arp, *Aj; real *A; -} csr_mat; - -#ifdef MPI - -typedef struct { - - /* communication */ - - MPI_Comm comm; - uint pid, np; /* proc id, number of procs */ - uint pcoord; /* coordinate in communication tree */ - unsigned plevels; /* # of stages of communication */ - sint *pother; /* let p = pother[i], then during stage i of fan-in, - if p>=0, receive from p - if p< 0, send to (-p-1) - fan-out is just the reverse ... 
- on proc 0, pother is never negative - on others, pother is negative for the last stage only */ - MPI_Request *mpireq; - MPI_Status *mpistatus; - - /* separators */ - - unsigned nsep; /* number of separators */ - uint *sep_size; /* # of dofs on each separator, - ordered from the bottom to the top of the tree: - separator 0 is the bottom-most one (dofs not shared) - separator nsep-1 is the root of the tree */ - - unsigned null_space; - real *share_weight; - - /* vector sizes */ - - uint un; /* user's vector size */ - - /* xxt_solve works with "condensed" vectors; - same dofs as in user's vectors, but no duplicates and no Dirichlet nodes, - and also ordered topologically (children before parents) according to the - separator tree */ - - uint cn; /* size of condensed vectors */ - sint *perm_u2c; /* permutation from user vector to condensed vector, - p=perm_u2c[i]; xu[i] = p=-1 ? 0 : xc[p]; */ - uint ln, sn; /* xc[0 ... ln-1] are not shared (ln=sep_size[0]) - xc[ln ... ln+sn-1] are shared - ln+sn = cn */ - - uint xn; /* # of columns of x = sum_i(sep_size[i]) - sep_size[0] */ - - /* data */ - sparse_cholesky_data fac_A_ll; - csr_mat A_sl; - uint *Xp; real *X; /* column i of X starts at X[Xp[i]] */ - - /* execution buffers */ - real *vl, *vc, *vx, *combuf; -} xxt_data; - -#else - -typedef struct { - unsigned null_space; - uint un, cn; - sint *perm_u2c; - sparse_cholesky_data fac_A_ll; - real *vc; -} xxt_data; - -#endif - -#ifdef MPI - -/* - for the binary communication tree, the procs are divided in half - at each level, with the second half always the larger - - e.g., for np = 13: - - +------13-------+ - | | - +---6---+ +---7---+ - | | | | - +-3-+ +-3-+ +-3-+ +-4-+ - 1 2 1 2 1 2 2 2 - 1^1 1^1 1^1 1^1 1^1 - - plevels is the number of levels in the tree - = np==1 ? 
1 : ( lg(np-1)+2 ) - - labelling the nodes with proc id's gives this communication tree: - - +-------0-------+ - | | - +---0---+ +---6---+ - | | | | - +-0-+ +-3-+ +-6-+ +-9-+ - 0 1 3 4 6 7 9 b - 1^2 4^5 7^8 9^a b^c - - consider proc 7 (pid = 7); - pcoord gives the position of the leaf labelled 7: - Root Right Left Right Left -> RRLRL -> 11010 - so pcoord = 11010 binary - note the parent coordinate can be found by bit shifting right - (i.e. dividing by 2) -*/ - -/* sets: pcoord, nsep, plevels, pother, mpireq, mpistatus */ -static void locate_proc(xxt_data *data) -{ - const uint id = data->pid; - uint n = data->np, c=1, odd=0, base=0; - unsigned level=0; - while(n>1) { - ++level; - odd=(odd<<1)|(n&1); - c<<=1, n>>=1; - if(id>=base+n) c|=1, base+=n, n+=(odd&1); - } - data->pcoord=c; - data->nsep = level+1; - data->plevels = data->nsep-1; - data->pother = tmalloc(sint,data->plevels); - data->mpireq = tmalloc(MPI_Request,data->plevels); - data->mpistatus = tmalloc(MPI_Status ,data->plevels); - for(level=0;levelplevels;++level) { - if((c&1)==1) { - uint targ = id - (n-(odd&1)); - data->pother[level]=-(sint)(targ+1); - data->plevels = level+1; - break; - } else { - data->pother[level]=id+n; - c>>=1, n=(n<<1)+(odd&1), odd>>=1; - } - } -} - -/* the tuple list describing the condensed dofs: - [(separator level, share count, global id)] */ -static const unsigned dof_mi=2, dof_ml=1; -static const unsigned dof_level=0, dof_count=1, dof_id=2; - -/* determine the size of each separator; - sums the separator sizes following the fan-in, fan-out comm. 
pattern - uses the share-counts to avoid counting dofs more than once */ -/* sets: xn, sep_size, ln, sn */ -static void discover_sep_sizes(xxt_data *data, tuple_list *dof, buffer *buf) -{ - const unsigned ns=data->nsep, nl=data->plevels; - const uint n = dof->n; - float *v, *recv; - unsigned i,lvl; uint j; - MPI_Status status; - - buffer_reserve(buf,2*ns*sizeof(float)); - v=buf->ptr, recv=v+ns; - - for(i=0;ivi[dof_mi*j+dof_level]] - +=1/(float)dof->vi[dof_mi*j+dof_count]; - - /* fan-in */ - for(lvl=0;lvlpother[lvl]; - unsigned s = ns-(lvl+1); - if(other<0) { - MPI_Send(v +lvl+1,s,MPI_FLOAT,-other-1,s,data->comm); - } else { - MPI_Recv(recv+lvl+1,s,MPI_FLOAT,other,s,data->comm,&status); - for(i=lvl+1;ipother[--lvl]; - unsigned s = ns-(lvl+1); - if(other<0) - MPI_Recv(v+lvl+1,s,MPI_FLOAT,-other-1,s,data->comm,&status); - else - MPI_Send(v+lvl+1,s,MPI_FLOAT,other,s,data->comm); - } - - data->xn=0; - data->sep_size = tmalloc(uint,ns); - for(i=0;isep_size[i]=s; - data->xn+=s; - } - data->ln=data->sep_size[0]; - data->sn=data->cn-data->ln; - data->xn-=data->ln; -} - -/* assuming [A,Aend) is sorted, - removes 0's and any duplicate entries, - returns new end */ -static ulong *unique_nonzero(ulong *A, ulong *Aend) -{ - if(Aend==A) return A; - else { - ulong *end = Aend-1, last=*end, *p=A,*q=A,v=0; - *end = 1; - while(*q==0) ++q; /* *q==0 => q!=end since *end==0 */ - *end = 0; - while(q!=end) { - v=*q++, *p++=v; - while(*q==v) ++q; /* *q==v => q!=end since *end==0 */ - } - if(last!=v) *p++=last; - return p; - } -} - -static void merge_sep_ids(xxt_data *data, ulong *sep_id, ulong *other, - ulong *work, unsigned s0) -{ - const unsigned ns = data->nsep; - unsigned s; - ulong *p=sep_id, *q=other; - for(s=s0;ssep_size[s]; - memcpy(work+2*size,p,size*sizeof(ulong)); - memcpy(work+3*size,q,size*sizeof(ulong)); - sort_long(work+2*size,2*size,1, work, work+4*size); - end = unique_nonzero(work,work+2*size); - memcpy(p,work,(end-work)*sizeof(ulong)); - p+=size, q+=size; - } -} - 
-static void init_sep_ids(xxt_data *data, tuple_list *dof, ulong *xid) -{ - const unsigned ns=data->nsep; - const uint n=data->cn, *sep_size=data->sep_size; - unsigned s=1; - uint i, size; - if(ns==1) return; - size=sep_size[s]; - for(i=data->ln;ivi[dof_mi*i+dof_level]; - while(s!=si) { - memset(xid,0,size*sizeof(ulong)); - xid+=size; - size=data->sep_size[++s]; - } - *xid++ = dof->vl[i], --size; - } - while(s!=ns) { - memset(xid,0,size*sizeof(ulong)); - xid+=size; - size=data->sep_size[++s]; - } -} - -static void find_perm_x2c(uint ln, uint cn, const ulong *cid, - uint xn, const ulong *xid, sint *perm) -{ - const ulong *cid_end = cid+cn, *xid_end = xid+xn; uint i=ln; - cid+=ln; - while(cid!=cid_end) { - ulong v=*cid; - while(*xid!=v) ++xid, *perm++ = -1; - *perm++ = i++, ++cid, ++xid; - } - while(xid!=xid_end) ++xid, *perm++ = -1; -} - -/* sets: perm_x2c */ -static sint* discover_sep_ids(xxt_data *data, tuple_list *dof, buffer *buf) -{ - const unsigned ns=data->nsep, nl=data->plevels; - const uint xn=data->xn, *sep_size=data->sep_size; - ulong *xid, *recv, *work, *p; - unsigned lvl; - uint size,ss; - MPI_Status status; - sint *perm_x2c; - - size=0; for(lvl=1;lvlsize) size=sep_size[lvl]; - buffer_reserve(buf,(2*xn+6*size)*sizeof(ulong)); - xid=buf->ptr, recv=xid+xn, work=recv+xn; - - init_sep_ids(data,dof,xid); - - if(nl) { - /* fan-in */ - p=xid, size=xn; - for(lvl=0;lvlpother[lvl]; - if(other<0) { - MPI_Send(p ,size*sizeof(ulong),MPI_UNSIGNED_CHAR, - -other-1,size,data->comm); - } else { - MPI_Recv(recv,size*sizeof(ulong),MPI_UNSIGNED_CHAR, - other,size,data->comm,&status); - merge_sep_ids(data,p,recv,work,lvl+1); - } - ss=data->sep_size[lvl+1]; - if(ss>=size || lvl==nl-1) break; - p+=ss, size-=ss; - } - /* fan-out */ - for(;;) { - sint other = data->pother[lvl]; - if(other<0) - MPI_Recv(p,size*sizeof(ulong),MPI_UNSIGNED_CHAR, - -other-1,size,data->comm,&status); - else - MPI_Send(p,size*sizeof(ulong),MPI_UNSIGNED_CHAR, - other,size,data->comm); - if(lvl==0) 
break; - ss=data->sep_size[lvl]; - p-=ss, size+=ss, --lvl; - } - } - -#if 0 - printf("xid%d:",data->pid); - { uint i; for(i=0;iln,data->cn,(ulong*)dof->vl, xn,xid, perm_x2c); - -#if 0 - printf("x2c %d:",data->pid); - { uint i; for(i=0;iplevels; - real *p=v, *recv=data->combuf; - unsigned lvl, nsend=0; - uint size=n, ss; - MPI_Status status; - - if(n==0 || nl==0) return; - - tag=tag*2+0; - /* fan-in */ - for(lvl=0;lvlpother[lvl]; - if(other<0) { - MPI_Send(p ,size*sizeof(real),MPI_UNSIGNED_CHAR, - -other-1,tag,data->comm); - } else { - uint i; - MPI_Recv(recv,size*sizeof(real),MPI_UNSIGNED_CHAR, - other ,tag,data->comm,&status); - for(i=0;isep_size[lvl+1]; - if(ss>=size || lvl==nl-1) break; - p+=ss, size-=ss; - } - /* fan-out */ - for(;;) { - sint other = data->pother[lvl]; - if(other<0) { - MPI_Recv (p,size*sizeof(real),MPI_UNSIGNED_CHAR, - -other-1,tag,data->comm,&status); - } else { - MPI_Isend(p,size*sizeof(real),MPI_UNSIGNED_CHAR, - other ,tag,data->comm,&data->mpireq[nsend++]); - } - if(lvl==0) break; - ss=data->sep_size[lvl]; - p-=ss, size+=ss, --lvl; - } - if(nsend) MPI_Waitall(nsend,data->mpireq,data->mpistatus); -} - -static real sum(xxt_data *data, real v, uint n, uint tag) -{ - const unsigned nl=data->plevels; - real r; - unsigned lvl,nsend=0; - uint size=n, ss; - MPI_Status status; - - tag=tag*2+1; - if(n==0 || nl==0) return v; - /* fan-in */ - for(lvl=0;lvlpother[lvl]; - if(other<0) { - MPI_Send(&v,sizeof(real),MPI_UNSIGNED_CHAR, - -other-1,tag,data->comm); - } else { - MPI_Recv(&r,sizeof(real),MPI_UNSIGNED_CHAR, - other ,tag,data->comm,&status); - v+=r; - } - ss=data->sep_size[lvl+1]; - if(ss>=size || lvl==nl-1) break; - size-=ss; - } - /* fan-out */ - for(;;) { - sint other = data->pother[lvl]; - if(other<0) { - MPI_Recv (&v,sizeof(real),MPI_UNSIGNED_CHAR, - -other-1,tag,data->comm,&status); - } else { - MPI_Isend(&v,sizeof(real),MPI_UNSIGNED_CHAR, - other ,tag,data->comm,&data->mpireq[nsend++]); - } - if(lvl==0) break; - ss=data->sep_size[lvl]; - 
size+=ss, --lvl; - } - if(nsend) MPI_Waitall(nsend,data->mpireq,data->mpistatus); - return v; -} - -#endif - -/* sorts an array of ids, removes 0's and duplicates; - just returns the permutation */ -static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf) -{ - uint *p, i, un=0; ulong last=0; - buffer_reserve(buf,2*n*sizeof(sort_data_long)); - p = buf->ptr; - index_sort_long(id,n,1, p, buf->ptr); - for(i=0;ipcoord, ns=data->nsep; - sint *perm; - uint i, cn, *p, *pi; - gs_data *gs; real *v; - buffer *buf=&crystal->all->buf; - - data->un = n; - data->perm_u2c = perm = tmalloc(sint,n); - data->cn = cn = unique_ids(n,id,perm,buf); - tuple_list_init_max(dof,dof_mi,dof_ml,0,cn); dof->n=cn; - for(i=0;i=0) dof->vl[perm[i]]=id[i]; - - gs = gs_data_setup(cn,(ulong*)dof->vl,1,crystal); - - buf = &crystal->all->buf; - buffer_reserve(buf,cn*sizeof(real)); - v = buf->ptr; - - for(i=0;ivi[dof_mi*i+dof_level]=ns-1-lg((uint)v[i]); - - for(i=0;ivi[dof_mi*i+dof_count]=v[i]+.1; - - gs_data_free(gs); - - buf = &crystal->all->buf; - buffer_reserve(buf,umax_3(2*cn*sizeof(sort_data), - cn*sizeof(uint)+cn*dof_mi*sizeof(sint), - cn*sizeof(uint)+cn*dof_ml*sizeof(slong))); - p=buf->ptr; - index_sort((uint*)&dof->vi[dof_level],cn,dof_mi, p, buf->ptr); - tuple_list_permute(dof,p,p+cn); - pi = p+cn; for(i=0;i=0) perm[i]=pi[perm[i]]; - -#if 0 - printf("id %d:",crystal->id); - for(i=0;iid); - for(i=0;iid); - for(i=0;ivl[i]); - printf("\n"); - printf("level %d:",crystal->id); - for(i=0;ivi[dof_mi*i+dof_level]); - printf("\n"); - printf("count %d:",crystal->id); - for(i=0;ivi[dof_mi*i+dof_count]); - printf("\n"); -#endif -} - -static real inner(const real *u, const real *v, unsigned n) -{ - const real *u_end = u+n; - real sum = 0; - while(u!=u_end) { sum += *u++ * *v++; } - return sum; -} - -/* vl += A_ls * vs */ -static void apply_p_Als(real *vl, xxt_data *data, const real *vs, uint ns) -{ - const uint *Arp = data->A_sl.Arp, - *Aj = data->A_sl.Aj; - const real *A = data->A_sl.A; - 
uint i,p,pe; - for(i=0;iA_sl.Arp, - *Aj = data->A_sl.Aj; - const real *A = data->A_sl.A; - uint i,p,pe; - for(i=0;iln; - const uint *Asl_rp = data->A_sl.Arp, *Ass_rp = A_ss->Arp, - *Asl_j = data->A_sl.Aj, *Ass_j = A_ss->Aj; - const real *Asl = data->A_sl.A, *Ass = A_ss->A; - uint i,p,pe; - for(i=0;i=ei) break; - vs[j]=-Ass[p]; - } - for(i=0;ifac_A_ll,vl); - apply_m_Asl(vs,ei,data,vl); -} - -static void apply_S(real *Svs, uint ns, xxt_data *data, csr_mat *A_ss, - const real *vs, real *vl) -{ - const uint ln=data->ln; - const uint *Ass_rp = A_ss->Arp, - *Ass_j = A_ss->Aj; - const real *Ass = A_ss->A; - uint i, p,pe; - for(i=0;i=ns) break; - sum+=Ass[p]*vs[j]; - } - Svs[i]=sum; - } - for(i=0;ifac_A_ll,vl); - apply_m_Asl(Svs,ns,data,vl); -} - -/* vx = X' * vs */ -static void apply_Xt(real *vx, uint nx, const xxt_data *data, const real *vs) -{ - const real *X = data->X; const uint *Xp = data->Xp; - uint i; for(i=0;iX; const uint *Xp = data->Xp; - uint i,j; - for(i=0;ixn; - uint i,h=0; - if(data->null_space && xn) --xn; - data->Xp = tmalloc(uint,xn+1); - data->Xp[0]=0; - for(i=0;iXp[i+1]=data->Xp[i]+h; - } - data->X = tmalloc(real,data->Xp[xn]); -} - -static void orthogonalize(xxt_data *data, csr_mat *A_ss, sint *perm_x2c, - buffer *buf) -{ - uint ln=data->ln, sn=data->sn, xn=data->xn; - real *vl, *vs, *vx, *Svs; - uint i,j; - - allocate_X(data,perm_x2c); - - buffer_reserve(buf,(ln+2*sn+xn)*sizeof(real)); - vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn; - - if(data->null_space && xn) --xn; - for(i=0;iXp[i+1]-data->Xp[i]; - sint ui = perm_x2c[i]; - real ytsy, *x; - /*if(data->pid==0) printf("xxt : column %d %d\n",i,data->pid);*/ - if(ui == -1) { - for(j=0;jX[data->Xp[i]]; - for(j=0;jn; - tuple_list_sort(mat,1,buf); - tuple_list_sort(mat,0,buf); - - p = (uint*)mat->vi; - for(k=0;k+1vr[k-1]; - const real *qr = &mat->vr[k]; - const uint *q = p+2; - for(;kn; - else p+=2, p[0]=i=q[0],p[1]=j=q[1], *(++pr)=*qr; - } - } - - nz=mat->n; - out->n=nr; - out->Arp = 
tmalloc(uint,nr+1+mat->n); - out->Aj = out->Arp+nr+1; - out->A = tmalloc(real,mat->n); - for(k=0;kArp[k]=0; - for(k=0;kArp[mat->vi[2*k]]++, - out->Aj[k]=mat->vi[2*k+1], - out->A[k]=mat->vr[k]; - nz=0; for(k=0;k<=nr;++k) { uint t=out->Arp[k]; out->Arp[k]=nz, nz+=t; } -} - -static void separate_matrix( - uint nz, const uint *Ai, const uint *Aj, const real *A, - const sint *perm, uint ln, uint sn, - csr_mat *out_ll, csr_mat *out_sl, csr_mat *out_ss, - buffer *buf -) -{ - uint k,n; - tuple_list mat_ll, mat_sl, mat_ss; - tuple_list_init_max(&mat_ll,2,0,1,2*nz); - tuple_list_init_max(&mat_sl,2,0,1,2*nz); - tuple_list_init_max(&mat_ss,2,0,1,2*nz); - for(k=0;kcomm,&data->comm); - data->pid = crystal->id; - data->np = crystal->num; - locate_proc(data); - - data->null_space=null_space; - - discover_dofs(data,n,id,&dof,crystal); - discover_sep_sizes(data,&dof,&crystal->all->buf); -#if 0 - if(data->pid>=0) - printf("xxt %d: un=%d cn=%d ln=%d sn=%d xn=%d proc=%d\n", - data->pid, data->un, data->cn, data->ln, data->sn, data->xn, data->pid); -#endif - perm_x2c = discover_sep_ids(data,&dof,&crystal->all->buf); - if(data->null_space) { - uint i; real count = 0; - for(i=0;icn;++i) count+=1/(real)dof.vi[dof_mi*i+dof_count]; - count=1/sum(data,count,data->xn,0); - data->share_weight=tmalloc(real,data->cn); - for(i=0;icn;++i) - data->share_weight[i]=count/dof.vi[dof_mi*i+dof_count]; - } - tuple_list_free(&dof); - - if(!data->null_space || data->xn!=0) { - separate_matrix(nz,Ai,Aj,A,data->perm_u2c, - data->ln,data->sn, - &A_ll,&data->A_sl,&A_ss, - &crystal->all->buf); - } else { - separate_matrix(nz,Ai,Aj,A,data->perm_u2c, - data->ln-1,1, - &A_ll,&data->A_sl,&A_ss, - &crystal->all->buf); - } - -#if 0 - if(data->pid==1) { - csr_mat *m[] = {&A_ll,&data->A_sl, &A_ss}; - char *name[] = {"A_ll", "A_sl","A_ss"}; - uint mi,i,p,pe; - for(mi=0;mi<3;++mi) { - printf("%s:\n", name[mi]); - for(i=0;in;++i) { - for(p=m[mi]->Arp[i],pe=m[mi]->Arp[i+1];p!=pe;++p) - printf(" (%d,%d) 
%g\n",i,m[mi]->Aj[p],m[mi]->A[p]); - } - } - } -#endif - - sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A, - &data->fac_A_ll, &crystal->all->buf); - free(A_ll.Arp); free(A_ll.A); - - data->vl = tmalloc(real,(data->ln+data->cn+2*data->xn)*sizeof(real)); - data->vc = data->vl+data->ln; - data->vx = data->vc+data->cn; - data->combuf = data->vx+data->xn; - - orthogonalize(data,&A_ss,perm_x2c,&crystal->all->buf); - free(A_ss.Arp); free(A_ss.A); - free(perm_x2c); - - return data; -} - -void xxt_solve(real *x, xxt_data *data, const real *b) -{ - uint cn=data->cn, un=data->un, ln=data->ln, sn=data->sn, xn=data->xn; - real *vl=data->vl, *vc=data->vc, *vx=data->vx; - uint i; - for(i=0;iperm_u2c[i]; - if(p>=0) vc[p]+=b[i]; - } - if(xn>0 && (!data->null_space || xn>1)) { - if(data->null_space) --xn; - sparse_cholesky_solve(vc,&data->fac_A_ll,vc); - apply_m_Asl(vc+ln,sn, data, vc); - apply_Xt(vx,xn, data, vc+ln); - apply_QQt(data,vx,xn,0); - apply_X(vc+ln,sn, data, vx,xn); - for(i=0;ifac_A_ll,vl); - for(i=0;ifac_A_ll,vc); - if(data->null_space) { - if(xn==0) vc[ln-1]=0; - else if(sn==1) vc[ln]=0; - } - } - if(data->null_space) { - real s=0; - for(i=0;ishare_weight[i]*vc[i]; - s = sum(data,s,data->xn,0); - for(i=0;iperm_u2c[i]; - x[i] = p>=0 ? 
vc[p] : 0; - } -} - -void xxt_stats(xxt_data *data) -{ - int a,b; uint xcol; - if(data->pid==0) { - unsigned s; - printf("xxt: separator sizes on %d =",data->pid); - for(s=0;snsep;++s) printf(" %d",(int)data->sep_size[s]); - printf("\n"); - printf("xxt: shared dofs on %d = %d\n",data->pid,data->sn); - } - a=data->ln; - MPI_Reduce(&a,&b,1,MPI_INT,MPI_MAX,0,data->comm); - if(data->pid==0) printf("xxt: max non-shared dofs = %d\n",b); - a=data->sn; - MPI_Reduce(&a,&b,1,MPI_INT,MPI_MAX,0,data->comm); - if(data->pid==0) printf("xxt: max shared dofs = %d\n",b); - xcol=data->xn; if(xcol&&data->null_space) --xcol; - a=xcol; - MPI_Reduce(&a,&b,1,MPI_INT,MPI_MAX,0,data->comm); - if(data->pid==0) printf("xxt: max X cols = %d\n",b); - a=data->Xp[xcol]*sizeof(real); - MPI_Reduce(&a,&b,1,MPI_INT,MPI_MAX,0,data->comm); - if(data->pid==0) printf("xxt: max X size = %d bytes\n",b); -} - -void xxt_free(xxt_data *data) -{ - MPI_Comm_free(&data->comm); - free(data->pother); - free(data->mpireq); - free(data->mpistatus); - free(data->sep_size); - free(data->perm_u2c); - if(data->null_space) free(data->share_weight); - sparse_cholesky_free(&data->fac_A_ll); - free(data->A_sl.Arp); free(data->A_sl.A); - free(data->Xp); free(data->X); - free(data->vl); - free(data); -} - -#else - -xxt_data *xxt_setup(uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const real *A, - uint null_space, void *crystal) -{ - xxt_data *data = tmalloc(xxt_data,1); - buffer buf; - csr_mat A_ll, A_sl, A_ss; - - buffer_init(&buf,1024); - data->null_space=null_space; - data->un = n; - data->perm_u2c = tmalloc(sint,n); - data->cn = unique_ids(n,id,data->perm_u2c,&buf); - - separate_matrix(nz,Ai,Aj,A,data->perm_u2c, - data->cn-(null_space?1:0),(null_space?1:0), - &A_ll,&A_sl,&A_ss, - &buf); -#if 0 - if(data->pid==0) { - csr_mat *m[] = {&A_ll, &A_sl, &A_ss}; - char *name[] = {"A_ll","A_sl","A_ss"}; - uint mi,i,p,pe; - for(mi=0;mi<3;++mi) { - printf("%s:\n", name[mi]); - for(i=0;in;++i) { - 
for(p=m[mi]->Arp[i],pe=m[mi]->Arp[i+1];p!=pe;++p) - printf(" (%d,%d) %g\n",i,m[mi]->Aj[p],m[mi]->A[p]); - } - } - } -#endif - - sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A, - &data->fac_A_ll, &buf); - free(A_ll.Arp); free(A_ll.A); - free(A_sl.Arp); free(A_sl.A); - free(A_ss.Arp); free(A_ss.A); - - data->vc = tmalloc(real,data->cn*sizeof(real)); - - buffer_free(&buf); - - return data; -} - -void xxt_solve(real *x, xxt_data *data, const real *b) -{ - uint cn=data->cn, un=data->un; - real *vc=data->vc; - uint i; - for(i=0;iperm_u2c[i]; - if(p>=0) vc[p]+=b[i]; - } - sparse_cholesky_solve(vc,&data->fac_A_ll,vc); - if(data->null_space) { - real s=0; - vc[cn-1]=0; - for(i=0;iperm_u2c[i]; - x[i] = p>=0 ? vc[p] : 0; - } -} - -void xxt_stats(xxt_data *data) -{ - printf("xxt: separator sizes on 0 = %d\n",data->cn); -} - -void xxt_free(xxt_data *data) -{ - free(data->perm_u2c); - sparse_cholesky_free(&data->fac_A_ll); - free(data->vc); - free(data); -} - -#endif - -/*-------------------------------------------------------------------------- - FORTRAN Interface - --------------------------------------------------------------------------*/ - -#include "fname.h" - -#define xxtsetup FORTRAN_NAME(xxtsetup,XXTSETUP) -#define xxtsolve FORTRAN_NAME(xxtsolve,XXTSOLVE) -#define xxtstats FORTRAN_NAME(xxtstats,XXTSTATS) -#define xxtfree FORTRAN_NAME(xxtfree ,XXTFREE) - -static xxt_data **handle_array = 0; -static int handle_max = 0; -static int handle_n = 0; - -#ifdef MPI -crystal_data *fcrystal_handle(sint h); -#endif - -void xxtsetup(sint *handle, const sint *crystal_handle, - const sint *n, const slong id[], - const sint *nz, const sint Ai[], const sint Aj[], const real A[], - const sint *null_space) -{ -#ifdef MPI - crystal_data *crystal = fcrystal_handle(*crystal_handle); -#else - void *crystal = 0; -#endif - if(handle_n==handle_max) - handle_max+=handle_max/2+1, - handle_array=trealloc(xxt_data*,handle_array,handle_max); - handle_array[handle_n]=xxt_setup(*n,(const 
ulong*)id, - *nz,(const uint*)Ai,(const uint*)Aj,A, - *null_space,crystal); - *handle = handle_n++; -} - -void xxtsolve(const sint *handle, real x[], const real b[]) -{ - if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) - failwith("invalid handle to xxtsolve"); - xxt_solve(x,handle_array[*handle],b); -} - -void xxtstats(const sint *handle) -{ - if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) - failwith("invalid handle to xxtstats"); - xxt_stats(handle_array[*handle]); -} - -void xxtfree(sint *handle) -{ - if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) - failwith("invalid handle to xxtfree"); - xxt_free(handle_array[*handle]); - handle_array[*handle] = 0; -} - - diff --git a/3rdParty/gslib.github/src/xxt_test.c b/3rdParty/gslib.github/src/xxt_test.c deleted file mode 100644 index 432ac58d1..000000000 --- a/3rdParty/gslib.github/src/xxt_test.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "crs.h" - -/* - - +---+---+ 1---7---4 - | 0 | 1 | | | | - +---+---+ 3---8---6 - | 0 | 2 | | | | - +---+---+ 2---9---5 - - - element laplacian matrix: - - 4 -1 -1 -2 - -1 4 -2 -1 - -1 -2 4 -1 - -2 -1 -1 4 - - assembled matrix: - - - 4 -1 -1 -2 - 4 -1 -2 -1 - -1 -1 8 -2 -2 -2 - 4 -1 -1 -2 - 4 -1 -2 -1 - -1 -1 8 -2 -2 -2 - -1 -2 -1 -2 8 -2 - -2 -2 -2 -2 -2 -2 -2 16 -2 - -1 -2 -1 -2 -2 8 - -*/ - -const uint nx[3] = {8,4,4}; -/* -const ulong x_id[3][8] = { {0,7,3,8, 3,8,2,9}, - {7,4,8,6}, - {8,6,9,5} }; -*/ -const ulong x_id[3][8] = { {0,2,4,5, 4,5,7,8}, - {2,3,5,6}, - {5,6,8,9} }; - - double bv[3][8][8] = { { {0,1/2.,0,0,0,0,0,0}, - {0,0,0,0,0,0,0,0}, - {0,0,1/2.,0,1/2.,0,0,0}, - {0,0,0,1/4.,0,1/4.,0,0}, - {0,0,0,0,0,0,0,0}, - {0,0,0,0,0,0,1,0}, - {0,0,0,0,0,0,0,1/2.}, - {0,0,0,0,0,0,0,0} }, - - { {1/2.,0,0,0}, - {0,1,0,0}, - {0,0,0,0}, - {0,0,1/4.,0}, - {0,0,0,1/2.}, - {0,0,0,0}, - {0,0,0,0}, - {0,0,0,0} }, - - { {0,0,0,0}, - {0,0,0,0}, - 
{0,0,0,0}, - {1/4.,0,0,0}, - {0,1/2.,0,0}, - {0,0,0,0}, - {0,0,1/2.,0}, - {0,0,0,1} } }; - -const uint nz[3] = {32,16,16}; -const -uint Ai[3][32] = { {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, - 4,4,4,4, 5,5,5,5, 6,6,6,6, 7,7,7,7}, - {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3}, - {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3} }; -const -uint Aj[3][32] = { {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3, - 4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7}, - {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}, - {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3} }; -const -double Ar[3][32] = { { 4,-1,-1,-2, -1,4,-2,-1, -1,-2,4,-1, -2,-1,-1,4, - 4,-1,-1,-2, -1,4,-2,-1, -1,-2,4,-1, -2,-1,-1,4 }, - { 4,-1,-1,-2, -1,4,-2,-1, -1,-2,4,-1, -2,-1,-1,4 }, - { 4,-1,-1,-2, -1,4,-2,-1, -1,-2,4,-1, -2,-1,-1,4 } }; - -int main(int narg, char* arg[]) -{ - struct crs_data *crs; - comm_ext world; int id,np; - struct comm comm; -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - if(np!=3) { puts("run with 3 procs"); exit(1); } - id = comm.id; - - crs = crs_setup(nx[id], &x_id[id][0], - nz[id], &Ai[id][0], &Aj[id][0], &Ar[id][0], - 0, &comm); - - crs_stats(crs); - - if(1) { - uint i,j; double xv[8]; - for(i=0;i<8;++i) { - crs_solve(xv,crs,&bv[id][i][0]); - printf("%d col %d:",id,i); - for(j=0;j -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "crs.h" - -#define M 3 - -int main(int narg, char* arg[]) -{ - uint n; ulong *xid; - uint nz; uint *Ai, *Aj; double *A; - uint i; - double *x, *b, *x2; - - struct crs_data *crs; - comm_ext world; int id,np; - struct comm comm; -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - id = comm.id; - - n=M+1; if(id==np-1) --n; - xid = tmalloc(ulong,n); x=tmalloc(double,3*n), b=x+n, x2=b+n; - for(i=0;i0 || id!=0 ? 
(xid[i]-1-mean)*(xid[i]-1-mean)-avg : 0) - - (i+1dif) dif=d; - } - printf("%d : max dif = %g\n",id,dif); - } - free(A); free(Ai); free(x); free(xid); - -#ifdef MPI - MPI_Finalize(); -#endif - - if(id==0) printf("test successful\n"); - - return 0; -} - diff --git a/3rdParty/gslib.github/src/xxt_test2.m b/3rdParty/gslib.github/src/xxt_test2.m deleted file mode 100644 index de1c0eb7f..000000000 --- a/3rdParty/gslib.github/src/xxt_test2.m +++ /dev/null @@ -1,97 +0,0 @@ -%p = [4 3 2 1 3 6 1 5 6 5 ] - -%inv(A)(p,p) - -function M=bdiag(A,B,C) - [ra ca]=size(A); - [rb cb]=size(B); - [rc cc]=size(C); - M = [ A zeros(ra,cb) zeros(ra,cc) - zeros(rb,ca) B zeros(rb,cc) - zeros(rc,ca) zeros(rc,cb) C ]; -end - -Al0=[]; -Ac0=zeros(2)([],:); -As0=[1 -.5; -.5 1]; - -Al1=[ 2 -.5 -1 0 - -.5 1 0 -.5 - -1 0 2 -.5 - 0 -.5 -.5 1]; -Ac1=[-.5 0; 0 0; 0 -.5; 0 0]; -As1=[1 -.5; -.5 1]; - -A0=[Al0 Ac0; Ac0' As0]; -A1=[Al1 Ac1; Ac1' As1]; - -Il=eye(4); Is=eye(2); -gI=eye(6); -Rl0=Il([],:); -Rl1=Il([1 2 3 4],:); -Rs0=Is([5 6]-4,:); -Rs1=Is([5 6]-4,:); -R0=[Rl0 zeros(size(Rl0)(1),size(Rs0)(2)) - zeros(size(Rs0)(1),size(Rl0)(2)) Rs0]; -R1=[Rl1 zeros(size(Rl1)(1),size(Rs1)(2)) - zeros(size(Rs1)(1),size(Rl1)(2)) Rs1]; - -A=R0'*A0*R0+R1'*A1*R1; - -All = bdiag(Al0,Al1,[]) -Als = bdiag(Ac0,Ac1,[]) -Ass = bdiag(As0,As1,[]) -Qs = [Rs0;Rs1] -ns = size(Qs)(1); - -Q = bdiag(Il,Qs,[]); - -A=[All Als*Qs; Qs'*Als' Qs'*Ass*Qs]; - -M = Q*(A\Q'); - -S0 = As0; -S1 = As1-Ac1'*(Al1\Ac1); - -dS = bdiag(S0,S1,[]); - -em=[Il -All\Als; 0*Als' eye(ns)]; -norm(M-em*[inv(All) 0*Als; 0*Als' Qs*inv(Qs'*dS*Qs)*Qs']*em') - -Rf0=Is([5 6]-4,:); -Rf1=Is([5 6]-4,:); -Qf = [Rf0;Rf1]; - -X0 = zeros(size(Rs0)(1),size(Rf0)(1)); -X1 = zeros(size(Rs1)(1),size(Rf1)(1)); -dX = bdiag(X0,X1,[]); -X = Qs'*dX*Qf; - -X0 = Rs0*X*Rf0'; - -n = size(Qs)(2) -for i = [1:n] - ei = eye(n)(:,i); - Ri = eye(n)([1:i-1],:); - se = dS*Qs*ei - Xtse = dX'*se - QQtXtse = Qf*Ri'*Ri*Qf'*Xtse - Qy = Qs*ei - dX*QQtXtse - ytsy = Qy'*dS*Qy - Qx = Qy/sqrt(ytsy) - xv 
= inv(Qs'*Qs)*Qs'*Qx; - X(:,i)=xv - X0 = Rs0*X*Rf0' - X1 = Rs1*X*Rf1' - dX = bdiag(X0,X1,[]); - pause -end - -norm(M-em*[inv(All) 0*Als; 0*Als' Qs*X*X'*Qs']*em') -norm(M-em*[inv(All) 0*Als; 0*Als' dX*Qf*Qf'*dX']*em') - -X - -inv(chol(A)) - - diff --git a/3rdParty/gslib/.travis.yml b/3rdParty/gslib/.travis.yml new file mode 100644 index 000000000..939c88fb2 --- /dev/null +++ b/3rdParty/gslib/.travis.yml @@ -0,0 +1,36 @@ +language: c + +before_install: + - export ROOT_DIR=`pwd` + - sudo apt-get update -qq + - sudo apt-get install -y mpich2 libmpich2-dev + +env: + matrix: + - TEST=crystal_test NP=2 + - TEST=findpts_el_2_test NP=2 + - TEST=findpts_el_2_test2 NP=2 + - TEST=findpts_el_3_test NP=2 + - TEST=findpts_el_3_test2 NP=2 + - TEST=findpts_el_2_test2 NP=2 + - TEST=findpts_local_test NP=2 + - TEST=findpts_test NP=2 + - TEST=gs_test NP=2 + - TEST=gs_test_old NP=2 + - TEST=gs_test_gop_nonblocking NP=2 + - TEST=gs_test_gop_blocking NP=2 + - TEST=gs_unique_test NP=2 + - TEST=lob_bnd_test NP=2 + - TEST=obbox_test NP=2 + - TEST=poly_test NP=2 + - TEST=sarray_sort_test NP=2 + - TEST=sarray_transfer_test NP=2 + - TEST=sort_test NP=2 + - TEST=sort_test2 NP=2 + +install: true + +script: + - cd $ROOT_DIR + - make CC=mpicc tests/$TEST + - mpiexec -np $NP ./tests/$TEST diff --git a/3rdParty/gslib/LICENSE b/3rdParty/gslib/LICENSE new file mode 100644 index 000000000..d0904265f --- /dev/null +++ b/3rdParty/gslib/LICENSE @@ -0,0 +1,58 @@ +Copyright (c) 2008-2017, UCHICAGO ARGONNE, LLC. + +The UChicago Argonne, LLC as Operator of Argonne National +Laboratory holds copyright in the Software. The copyright holder +reserves all rights except those expressly granted to licensees, +and U.S. Government license rights. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the disclaimer below. 
+ +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the disclaimer (as noted below) +in the documentation and/or other materials provided with the +distribution. + +3. Neither the name of ANL nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +UCHICAGO ARGONNE, LLC, THE U.S. DEPARTMENT OF +ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Additional BSD Notice +--------------------- +1. This notice is required to be provided under our contract with +the U.S. Department of Energy (DOE). This work was produced at +Argonne National Laboratory under Contract +No. DE-AC02-06CH11357 with the DOE. + +2. Neither the United States Government nor UCHICAGO ARGONNE, +LLC nor any of their employees, makes any warranty, +express or implied, or assumes any liability or responsibility for the +accuracy, completeness, or usefulness of any information, apparatus, +product, or process disclosed, or represents that its use would not +infringe privately-owned rights. + +3. 
Also, reference herein to any specific commercial products, process, +or services by trade name, trademark, manufacturer or otherwise does +not necessarily constitute or imply its endorsement, recommendation, +or favoring by the United States Government or UCHICAGO ARGONNE LLC. +The views and opinions of authors expressed +herein do not necessarily state or reflect those of the United States +Government or UCHICAGO ARGONNE, LLC, and shall +not be used for advertising or product endorsement purposes. diff --git a/3rdParty/gslib/Makefile b/3rdParty/gslib/Makefile new file mode 100644 index 000000000..8aa762c13 --- /dev/null +++ b/3rdParty/gslib/Makefile @@ -0,0 +1,127 @@ +MPI ?= 1 +MPIIO ?= 1 +ADDUS ?= 1 +USREXIT ?= 0 +LIBNAME ?= gs +BLAS ?= 0 +CFLAGS ?= -O2 +FFLAGS ?= -O2 + +SRCROOT=. +TESTDIR=$(SRCROOT)/tests +FTESTDIR=$(TESTDIR)/fortran +SRCDIR=$(SRCROOT)/src +INCDIR=$(SRCROOT)/src +LIBDIR=$(SRCROOT)/lib + +ifneq (,$(strip $(PREFIX))) +INSTALL_ROOT = $(PREFIX) +else +INSTALL_ROOT = $(LIBDIR) +endif + +ifneq (0,$(MPI)) + G+=-DMPI +endif + +ifneq (0,$(MPIIO)) + ifneq (0,$(MPI)) + G+=-DUSEMPIIO + endif +endif + +ifneq (0,$(ADDUS)) + G+=-DUNDERSCORE +endif + +ifneq (0,$(USREXIT)) + G+=-DUSE_USR_EXIT +endif + +ifeq (0,$(BLAS)) + G+=-DUSE_NAIVE_BLAS +endif + +ifeq (1,$(BLAS)) + G+=-DUSE_CBLAS +endif + +ifneq ($(PREFIX),) + G+=-DPREFIX=$(PREFIX) +endif + +ifneq ($(FPREFIX),) + G+=-DFPREFIX=$(FPREFIX) +endif + +G+=-DGLOBAL_LONG_LONG +#G+=-DPRINT_MALLOCS=1 +#G+=-DGS_TIMING -DGS_BARRIER + +CCCMD=$(CC) $(CFLAGS) -I$(INCDIR) $(G) +FCCMD=$(FC) $(FFLAGS) + +LINKCMD=$(CC) $(CFLAGS) -I$(INCDIR) $(G) $^ -o $@ -L$(SRCDIR) \ + -l$(LIBNAME) -lm $(LDFLAGS) + +TESTS=$(TESTDIR)/sort_test $(TESTDIR)/sort_test2 $(TESTDIR)/sarray_sort_test \ + $(TESTDIR)/comm_test $(TESTDIR)/crystal_test \ + $(TESTDIR)/sarray_transfer_test $(TESTDIR)/gs_test \ + $(TESTDIR)/gs_test_gop_blocking $(TESTDIR)/gs_test_gop_nonblocking \ + $(TESTDIR)/gs_unique_test $(TESTDIR)/gs_test_old \ + 
$(TESTDIR)/findpts_el_2_test \ + $(TESTDIR)/findpts_el_2_test2 $(TESTDIR)/findpts_el_3_test \ + $(TESTDIR)/findpts_el_3_test2 $(TESTDIR)/findpts_local_test \ + $(TESTDIR)/findpts_test $(TESTDIR)/poly_test \ + $(TESTDIR)/lob_bnd_test $(TESTDIR)/obbox_test + +FTESTS=$(FTESTDIR)/f-igs + +GS=$(SRCDIR)/gs.o $(SRCDIR)/sort.o $(SRCDIR)/sarray_transfer.o \ + $(SRCDIR)/sarray_sort.o $(SRCDIR)/gs_local.o $(SRCDIR)/fail.o \ + $(SRCDIR)/crystal.o $(SRCDIR)/comm.o $(SRCDIR)/tensor.o + +FWRAPPER=$(SRCDIR)/fcrystal.o $(SRCDIR)/findpts.o +INTP=$(SRCDIR)/findpts_local.o $(SRCDIR)/obbox.o $(SRCDIR)/poly.o \ + $(SRCDIR)/lob_bnd.o $(SRCDIR)/findpts_el_3.o $(SRCDIR)/findpts_el_2.o + +.PHONY: all lib install deps tests clean objects odepinfo + +all : lib tests + +lib: $(GS) $(FWRAPPER) $(INTP) $(SRCDIR)/rand_elt_test.o + @$(AR) cr $(SRCDIR)/lib$(LIBNAME).a $? + @ranlib $(SRCDIR)/lib$(LIBNAME).a + +install: lib + @mkdir -p $(INSTALL_ROOT) 2>/dev/null + @cp -v $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT) 2>/dev/null + +tests: $(TESTS) $(FTESTS) + +clean: ; @$(RM) $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(TESTS) $(TESTS)/*.o $(FTESTS) $(FTESTS)/*.o + +cmds: ; @echo CC = $(CCCMD); echo LINK = $(LINKCMD); + +deps: ; ./cdep.py *.c > makefile.cdep; + +odepinfo: deps objects; @./odep_info.py *.o + +$(TESTS): % : %.o | lib + $(LINKCMD) + +$(FTESTS): % : %.o | lib + $(FCCMD) $^ -o $@ -L$(SRCDIR) -l$(LIBNAME) + +-include makefile.cdep + +%.o: %.c ; $(CCCMD) -c $< -o $@ +%.o: %.f ; $(FCCMD) -c $< -o $@ +%.s: %.c ; $(CCCMD) -S $< -o $@ +objects: $(OBJECTS) ; + +#poly_imp.h: gen_poly_imp.c +# $(RM) poly_imp.h; +# $(CC) -lgmp -lm gen_poly_imp.c -o gen_poly_imp; +# ./gen_poly_imp > poly_imp.h; +# $(RM) gen_poly_imp diff --git a/3rdParty/gslib/README.md b/3rdParty/gslib/README.md new file mode 100644 index 000000000..8ded873a3 --- /dev/null +++ b/3rdParty/gslib/README.md @@ -0,0 +1,23 @@ +# GSLIB + +[![Build Status](https://travis-ci.org/gslib/gslib.svg?branch=master)](https://travis-ci.org/gslib/gslib) 
+ +* Scalable Many-to-Many collectives +* Robust interpolation for hexahedral spectral element meshes + +# Build Instructions + +The build system relies on GNU Make with the `make` command. To compile gslib just run: + +``` +make CC=mpicc FC=mpif77 +make PREFIX= install +``` + +# Applications + +**\[1] [Nek5000](https://nek5000.mcs.anl.gov/)**: Nek5000 open-source, spectral element code. + +**\[2] [CEED](http://ceed.exascaleproject.org/)**: Co-design center for Efficient Exascale Discretizations. + +**\[3] [Nektar++](http://www.nektar.info)**: Nektar++ open-source spectral/hp element code. diff --git a/3rdParty/gslib/RELEASE.md b/3rdParty/gslib/RELEASE.md new file mode 100644 index 000000000..fdaf06f49 --- /dev/null +++ b/3rdParty/gslib/RELEASE.md @@ -0,0 +1,17 @@ +# Release 1.0.3 + +## Major Features and Improvements +* Added non-blocking gather/scatter operations (CR not supported yet) +* Added Fortran wrapper for gs_unique +* Added gs_hf2c to convert Fortran into C handle + +## Backwards-Incompatible Changes +* Removed XXT and AMG solver from distribution + +## Bug Fixes and Other Changes + +[17](https://github.com/gslib/gslib/issues/17) + +## Thanks to our Contributors +This release contains contributions from: @stgeke +We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. 
diff --git a/3rdParty/gslib.github/src/cdep.py b/3rdParty/gslib/cdep.py similarity index 100% rename from 3rdParty/gslib.github/src/cdep.py rename to 3rdParty/gslib/cdep.py diff --git a/3rdParty/gslib.github/src/makefile.cdep b/3rdParty/gslib/makefile.cdep similarity index 68% rename from 3rdParty/gslib.github/src/makefile.cdep rename to 3rdParty/gslib/makefile.cdep index ae6672b97..e5c6ee766 100644 --- a/3rdParty/gslib.github/src/makefile.cdep +++ b/3rdParty/gslib/makefile.cdep @@ -9,10 +9,10 @@ fcrs.o: fcrs.c crs.h comm.h mem.h types.h fail.h name.h c99.h fcrystal.o: fcrystal.c sarray_transfer.h sarray_sort.h sort.h crystal.h comm.h mem.h types.h fail.h name.h c99.h findpts.o: findpts.c findpts_imp.h findpts_imp.h sarray_sort.h sort.h sarray_transfer.h crystal.h comm.h gs_defs.h findpts_local.h findpts_el.h obbox.h poly.h mem.h fail.h types.h name.h c99.h findpts_el_2.o: findpts_el_2.c poly.h tensor.h mem.h types.h fail.h name.h c99.h -findpts_el_2_test2.o: findpts_el_2_test2.c rdtsc.h rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h +findpts_el_2_test2.o: findpts_el_2_test2.c rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h findpts_el_2_test.o: findpts_el_2_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h findpts_el_3.o: findpts_el_3.c poly.h tensor.h mem.h types.h fail.h name.h c99.h -findpts_el_3_test2.o: findpts_el_3_test2.c rdtsc.h rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h +findpts_el_3_test2.o: findpts_el_3_test2.c rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h findpts_el_3_test.o: findpts_el_3_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h findpts_local.o: findpts_local.c findpts_local_imp.h findpts_local_imp.h findpts_el.h sarray_sort.h sort.h poly.h obbox.h mem.h fail.h name.h types.h c99.h findpts_local_test.o: 
findpts_local_test.c rand_elt_test.h findpts_local.h findpts_el.h obbox.h poly.h types.h mem.h fail.h name.h c99.h @@ -28,7 +28,6 @@ lob_bnd_test.o: lob_bnd_test.c lob_bnd.h poly.h tensor.h mem.h fail.h name.h typ obbox.o: obbox.c lob_bnd.h poly.h tensor.h mem.h types.h fail.h name.h c99.h obbox_test.o: obbox_test.c rand_elt_test.h obbox.h lob_bnd.h poly.h mem.h fail.h name.h types.h c99.h poly.o: poly.c poly_imp.h mem.h types.h fail.h name.h c99.h -poly_test2.o: poly_test2.c rdtsc.h poly.h mem.h fail.h name.h types.h c99.h poly_test.o: poly_test.c poly.h types.h name.h c99.h rand_elt_test.o: rand_elt_test.c lob_bnd.h poly.h name.h types.h c99.h sarray_sort.o: sarray_sort.c sort.h mem.h fail.h types.h name.h c99.h @@ -36,13 +35,8 @@ sarray_sort_test.o: sarray_sort_test.c sarray_sort.h sort.h mem.h types.h fail.h sarray_transfer.o: sarray_transfer.c sort.h crystal.h mem.h comm.h types.h fail.h name.h c99.h sarray_transfer_test.o: sarray_transfer_test.c sarray_transfer.h crystal.h sarray_sort.h sort.h mem.h comm.h types.h fail.h name.h c99.h sort.o: sort.c sort_imp.h sort_imp.h sort_imp.h mem.h types.h fail.h name.h c99.h -sort_test2.o: sort_test2.c rdtsc.h sort.h mem.h types.h fail.h name.h c99.h +sort_test2.o: sort_test2.c sort.h mem.h types.h fail.h name.h c99.h sort_test.o: sort_test.c sort.h mem.h types.h fail.h name.h c99.h -sparse_cholesky.o: sparse_cholesky.c sort.h mem.h types.h fail.h name.h c99.h -spchol_test.o: spchol_test.c sparse_cholesky.h mem.h types.h fail.h name.h c99.h tensor.o: tensor.c types.h name.h c99.h -xxt.o: xxt.c gs.h sparse_cholesky.h sarray_sort.h sort.h mem.h comm.h gs_defs.h tensor.h types.h fail.h name.h c99.h -xxt_test2.o: xxt_test2.c crs.h mem.h comm.h types.h fail.h name.h c99.h -xxt_test.o: xxt_test.c crs.h comm.h types.h fail.h name.h -OBJECTS= amg.o comm.o comm_test.o crs_test.o crystal.o crystal_test.o fail.o fcrs.o fcrystal.o findpts.o findpts_el_2.o findpts_el_2_test2.o findpts_el_2_test.o findpts_el_3.o findpts_el_3_test2.o 
findpts_el_3_test.o findpts_local.o findpts_local_test.o findpts_test.o gen_poly_imp.o gs.o gs_local.o gs_test.o gs_test_old.o gs_unique_test.o lob_bnd.o lob_bnd_test.o obbox.o obbox_test.o poly.o poly_test2.o poly_test.o rand_elt_test.o sarray_sort.o sarray_sort_test.o sarray_transfer.o sarray_transfer_test.o sort.o sort_test2.o sort_test.o sparse_cholesky.o spchol_test.o tensor.o xxt.o xxt_test2.o xxt_test.o +OBJECTS= comm.o comm_test.o crs_test.o crystal.o crystal_test.o fail.o fcrs.o fcrystal.o findpts.o findpts_el_2.o findpts_el_2_test2.o findpts_el_2_test.o findpts_el_3.o findpts_el_3_test2.o findpts_el_3_test.o findpts_local.o findpts_local_test.o findpts_test.o gen_poly_imp.o gs.o gs_local.o gs_test.o gs_test_old.o gs_unique_test.o lob_bnd.o lob_bnd_test.o obbox.o obbox_test.o poly.o poly_test2.o poly_test.o rand_elt_test.o sarray_sort.o sarray_sort_test.o sarray_transfer.o sarray_transfer_test.o sort.o sort_test2.o sort_test.o tensor.o diff --git a/3rdParty/gslib.github/src/odep_info.py b/3rdParty/gslib/odep_info.py similarity index 100% rename from 3rdParty/gslib.github/src/odep_info.py rename to 3rdParty/gslib/odep_info.py diff --git a/3rdParty/gslib.github/src/c99.h b/3rdParty/gslib/src/c99.h similarity index 100% rename from 3rdParty/gslib.github/src/c99.h rename to 3rdParty/gslib/src/c99.h diff --git a/3rdParty/gslib.github/src/comm.c b/3rdParty/gslib/src/comm.c similarity index 80% rename from 3rdParty/gslib.github/src/comm.c rename to 3rdParty/gslib/src/comm.c index 41e2d6695..e537278f4 100644 --- a/3rdParty/gslib.github/src/comm.c +++ b/3rdParty/gslib/src/comm.c @@ -31,7 +31,7 @@ static void scan_imp(void *scan, const struct comm *com, gs_dom dom, gs_op op, c<<=1, n>>=1; if(id>=base+n) c|=1, base+=n, n+=(odd&1); } - gs_init_array(scan,vn,dom,op,0); + gs_init_array(scan,vn,dom,op); memcpy(red,v,vsize); while(nc,req); + return; + } +#endif +#ifdef MPI +comm_allreduce_byhand: + allreduce_imp(com,dom,op, v,vn, buf); +#endif +} + double comm_dot(const 
struct comm *comm, double *v, double *w, uint n) { double s=tensor_dot(v,w,n),b; diff --git a/3rdParty/gslib.github/src/comm.h b/3rdParty/gslib/src/comm.h similarity index 95% rename from 3rdParty/gslib.github/src/comm.h rename to 3rdParty/gslib/src/comm.h index c3140ffde..1bd88264a 100644 --- a/3rdParty/gslib.github/src/comm.h +++ b/3rdParty/gslib/src/comm.h @@ -71,9 +71,10 @@ typedef int comm_req; typedef int MPI_Fint; #endif -#define comm_allreduce PREFIXED_NAME(comm_allreduce) -#define comm_scan PREFIXED_NAME(comm_scan ) -#define comm_dot PREFIXED_NAME(comm_dot ) +#define comm_allreduce PREFIXED_NAME(comm_allreduce ) +#define comm_iallreduce PREFIXED_NAME(comm_iallreduce) +#define comm_scan PREFIXED_NAME(comm_scan ) +#define comm_dot PREFIXED_NAME(comm_dot ) /* global id, np vars strictly for diagnostic messages (fail.c) */ #ifndef comm_gbl_id @@ -108,6 +109,8 @@ double comm_dot(const struct comm *comm, double *v, double *w, uint n); #ifdef GS_DEFS_H void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, void *v, uint vn, void *buf); +void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op, + void *v, uint vn, void *buf); void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, const void *v, uint vn, void *buffer); diff --git a/3rdParty/gslib/src/crs.h b/3rdParty/gslib/src/crs.h new file mode 100644 index 000000000..eeb60d33c --- /dev/null +++ b/3rdParty/gslib/src/crs.h @@ -0,0 +1,36 @@ +#ifndef CRS_H +#define CRS_H + +#if !defined(COMM_H) +#warning "crs.h" requires "comm.h" +#endif + +#define crs_xxt_setup PREFIXED_NAME(crs_xxt_setup) +#define crs_xxt_solve PREFIXED_NAME(crs_xxt_solve) +#define crs_xxt_stats PREFIXED_NAME(crs_xxt_stats) +#define crs_xxt_free PREFIXED_NAME(crs_xxt_free ) + +#define crs_amg_setup PREFIXED_NAME(crs_amg_setup) +#define crs_amg_solve PREFIXED_NAME(crs_amg_solve) +#define crs_amg_stats PREFIXED_NAME(crs_amg_stats) +#define crs_amg_free PREFIXED_NAME(crs_amg_free ) + +struct 
crs_data; + +struct crs_data *crs_xxt_setup( + uint n, const ulong *id, + uint nz, const uint *Ai, const uint *Aj, const double *A, + uint null_space, const struct comm *comm); +void crs_xxt_solve(double *x, struct crs_data *data, double *b); +void crs_xxt_stats(struct crs_data *data); +void crs_xxt_free(struct crs_data *data); + +struct crs_data *crs_amg_setup( + uint n, const ulong *id, + uint nz, const uint *Ai, const uint *Aj, const double *A, + uint null_space, const struct comm *comm); +void crs_amg_solve(double *x, struct crs_data *data, double *b); +void crs_amg_stats(struct crs_data *data); +void crs_amg_free(struct crs_data *data); + +#endif diff --git a/3rdParty/gslib.github/src/crystal.c b/3rdParty/gslib/src/crystal.c similarity index 100% rename from 3rdParty/gslib.github/src/crystal.c rename to 3rdParty/gslib/src/crystal.c diff --git a/3rdParty/gslib.github/src/crystal.h b/3rdParty/gslib/src/crystal.h similarity index 100% rename from 3rdParty/gslib.github/src/crystal.h rename to 3rdParty/gslib/src/crystal.h diff --git a/3rdParty/gslib.github/src/fail.c b/3rdParty/gslib/src/fail.c similarity index 78% rename from 3rdParty/gslib.github/src/fail.c rename to 3rdParty/gslib/src/fail.c index 5c5d32d10..4289a2ea0 100644 --- a/3rdParty/gslib.github/src/fail.c +++ b/3rdParty/gslib/src/fail.c @@ -7,15 +7,24 @@ #include "types.h" #include "comm.h" -#define nek_exitt FORTRAN_UNPREFIXED(exitt,EXITT) +#ifdef USE_USR_EXIT +#define userExitHandler FORTRAN_NAME(userexithandler,USEREXITHANDLER) +#define USEREXIT 1 +extern void userExitHandler(int status); +#else +#define USEREXIT 0 +void userExitHandler(int status) {}; +#endif + void die(int status) { -#ifdef NO_NEK_EXITT - if(comm_gbl_id==0) exit(status); else for(;;) ; -#else - //*nek_exitt(); - exit(1); -#endif + if (USEREXIT) { + userExitHandler(status); + while(1); + } else { + exit(status); + while(1); + } } void vdiagnostic(const char *prefix, const char *file, unsigned line, diff --git 
a/3rdParty/gslib.github/src/fail.h b/3rdParty/gslib/src/fail.h similarity index 100% rename from 3rdParty/gslib.github/src/fail.h rename to 3rdParty/gslib/src/fail.h diff --git a/3rdParty/gslib.github/src/fcrystal.c b/3rdParty/gslib/src/fcrystal.c similarity index 100% rename from 3rdParty/gslib.github/src/fcrystal.c rename to 3rdParty/gslib/src/fcrystal.c diff --git a/3rdParty/gslib.github/src/findpts.c b/3rdParty/gslib/src/findpts.c similarity index 87% rename from 3rdParty/gslib.github/src/findpts.c rename to 3rdParty/gslib/src/findpts.c index 9f71c20da..1ed472f36 100644 --- a/3rdParty/gslib.github/src/findpts.c +++ b/3rdParty/gslib/src/findpts.c @@ -187,12 +187,24 @@ static uint count_bits(unsigned char *p, uint n) or x(2) for ym, x(3) for zm + -------------------------------------------------------------------------- + call findpts_eval_local(h, + out_base, out_stride, + el_base, el_stride, + r_base, r_stride, npt, + input_field) + + just like findpts_eval, but does assumes all points are local, + and does no communication. will use matrix-matrix products if + points are grouped by element. 
+ --------------------------------------------------------------------------*/ -#define ffindpts_setup FORTRAN_NAME(findpts_setup,FINDPTS_SETUP) -#define ffindpts_free FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) -#define ffindpts FORTRAN_NAME(findpts ,FINDPTS ) -#define ffindpts_eval FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) +#define ffindpts_setup FORTRAN_NAME(findpts_setup ,FINDPTS_SETUP ) +#define ffindpts_free FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) +#define ffindpts FORTRAN_NAME(findpts ,FINDPTS ) +#define ffindpts_eval FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) +#define ffindpts_eval_local FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL) struct handle { void *data; unsigned ndim; }; static struct handle *handle_array = 0; @@ -334,3 +346,24 @@ void ffindpts_eval(const sint *const handle, r_base,(* r_stride)*sizeof(double), *npt, in, h->data); } + +void ffindpts_eval_local(const sint *const handle, + double *const out_base, const sint *const out_stride, + const sint *const el_base, const sint *const el_stride, + const double *const r_base, const sint *const r_stride, + const sint *const npt, const double *const in) +{ + CHECK_HANDLE("findpts_eval_local"); + if(h->ndim==2) + findpts_local_eval_2( + out_base,(* out_stride)*sizeof(double), + (uint*) el_base,(* el_stride)*sizeof(sint ), + r_base,(* r_stride)*sizeof(double), + *npt, in, &((struct findpts_data_2 *)h->data)->local); + else + findpts_local_eval_3( + out_base,(* out_stride)*sizeof(double), + (uint*) el_base,(* el_stride)*sizeof(sint ), + r_base,(* r_stride)*sizeof(double), + *npt, in, &((struct findpts_data_3 *)h->data)->local); +} diff --git a/3rdParty/gslib.github/src/findpts.h b/3rdParty/gslib/src/findpts.h similarity index 100% rename from 3rdParty/gslib.github/src/findpts.h rename to 3rdParty/gslib/src/findpts.h diff --git a/3rdParty/gslib.github/src/findpts_el.h b/3rdParty/gslib/src/findpts_el.h similarity index 100% rename from 3rdParty/gslib.github/src/findpts_el.h rename to 
3rdParty/gslib/src/findpts_el.h diff --git a/3rdParty/gslib.github/src/findpts_el_2.c b/3rdParty/gslib/src/findpts_el_2.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_el_2.c rename to 3rdParty/gslib/src/findpts_el_2.c diff --git a/3rdParty/gslib.github/src/findpts_el_3.c b/3rdParty/gslib/src/findpts_el_3.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_el_3.c rename to 3rdParty/gslib/src/findpts_el_3.c diff --git a/3rdParty/gslib.github/src/findpts_imp.h b/3rdParty/gslib/src/findpts_imp.h similarity index 100% rename from 3rdParty/gslib.github/src/findpts_imp.h rename to 3rdParty/gslib/src/findpts_imp.h diff --git a/3rdParty/gslib.github/src/findpts_local.c b/3rdParty/gslib/src/findpts_local.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_local.c rename to 3rdParty/gslib/src/findpts_local.c diff --git a/3rdParty/gslib.github/src/findpts_local.h b/3rdParty/gslib/src/findpts_local.h similarity index 100% rename from 3rdParty/gslib.github/src/findpts_local.h rename to 3rdParty/gslib/src/findpts_local.h diff --git a/3rdParty/gslib.github/src/findpts_local_imp.h b/3rdParty/gslib/src/findpts_local_imp.h similarity index 100% rename from 3rdParty/gslib.github/src/findpts_local_imp.h rename to 3rdParty/gslib/src/findpts_local_imp.h diff --git a/3rdParty/gslib.github/src/gen_poly_imp.c b/3rdParty/gslib/src/gen_poly_imp.c similarity index 100% rename from 3rdParty/gslib.github/src/gen_poly_imp.c rename to 3rdParty/gslib/src/gen_poly_imp.c diff --git a/3rdParty/gslib.github/src/gs.c b/3rdParty/gslib/src/gs.c similarity index 71% rename from 3rdParty/gslib.github/src/gs.c rename to 3rdParty/gslib/src/gs.c index ffdc04bb6..7b91607ca 100644 --- a/3rdParty/gslib.github/src/gs.c +++ b/3rdParty/gslib/src/gs.c @@ -2,9 +2,7 @@ #include #include #include -#ifdef _OPENACC -#include -#endif +#include #include "c99.h" #include "name.h" #include "fail.h" @@ -24,16 +22,17 @@ #define gs PREFIXED_NAME(gs ) #define gs_vec 
PREFIXED_NAME(gs_vec ) #define gs_many PREFIXED_NAME(gs_many ) +#define igs PREFIXED_NAME(igs ) +#define igs_vec PREFIXED_NAME(igs_vec ) +#define igs_many PREFIXED_NAME(igs_many ) +#define gs_wait PREFIXED_NAME(gs_wait ) #define gs_setup PREFIXED_NAME(gs_setup ) #define gs_free PREFIXED_NAME(gs_free ) #define gs_unique PREFIXED_NAME(gs_unique) +#define gs_hf2c PREFIXED_NAME(gs_hf2c ) GS_DEFINE_DOM_SIZES() -/* Function prototypes */ -void gs_flatmap_setup(const uint *map, int **mapf, int *mf_nt, int *m_size); -static int map_size(const uint *map, int *t); -static int fp_map_size(const uint *map); typedef enum { mode_plain, mode_vec, mode_many, mode_dry_run } gs_mode; @@ -41,14 +40,12 @@ static buffer static_buffer = null_buffer; static void gather_noop( void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op, int dstride, - int mf_nt, int *mapf) + const uint *map, gs_dom dom, gs_op op) {} static void scatter_noop( void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, int dstride, int mf_nt, - int *mapf) + const uint *map, gs_dom dom) {} static void init_noop( @@ -56,16 +53,13 @@ static void init_noop( const uint *map, gs_dom dom, gs_op op) {} - - - /*------------------------------------------------------------------------------ Topology Discovery ------------------------------------------------------------------------------*/ struct gs_topology { ulong total_shared; /* number of globally unique shared ids */ - struct array nz; /* array of nonzero_id's, grouped by id, + struct array nz; /* array of nonzero_id's, grouped by id, sorted by primary index, then flag, then index */ struct array sh; /* array of shared_id's, arbitrary ordering */ struct array pr; /* array of primary_shared_id's */ @@ -97,8 +91,8 @@ struct nonzero_id { static void nonzero_ids(struct array *nz, const slong *id, const uint n, buffer *buf) { - ulong last_id = -(ulong)1; - uint i, primary = -(uint)1; + ulong last_id = ULONG_MAX; + uint i, 
primary = UINT_MAX; struct nonzero_id *row, *end; array_init(struct nonzero_id,nz,n), end=row=nz->ptr; for(i=0;iptr,wa->n, id,1, buf); array_init(struct shared_id,sh,wa->n), sh->n=wa->n, s=sh->ptr; @@ -201,7 +195,7 @@ static ulong shared_ids(struct array *sh, struct array *pr, const struct array *nz, struct crystal *cr) { struct array un; struct unique_id *un_row, *un_end, *other; - ulong last_id = -(ulong)1; + ulong last_id = ULONG_MAX; ulong ordinal[2], n_shared=0, scan_buf[2]; struct array wa; struct shared_id_work *w; uint n_unique; @@ -225,7 +219,7 @@ static ulong shared_ids(struct array *sh, struct array *pr, and ordinal[0] of those are seen by work procs of lower rank; i.e., this work processor sees the range ordinal[0] + (0,n_shared-1) */ /* construct list of shared ids */ - last_id = -(ulong)1; + last_id = ULONG_MAX; array_init(struct shared_id_work,&wa,un.n), wa.n=0, w=wa.ptr; for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) { ulong id = un_row->id; @@ -282,10 +276,10 @@ static void make_topology_unique(struct gs_topology *top, slong *id, sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf); /* assign owner among shared primaries */ - + /* create sentinel with i = -1 */ array_reserve(struct shared_id,sh,sh->n+1); - ((struct shared_id*)sh->ptr)[sh->n].i = -(uint)1; + ((struct shared_id*)sh->ptr)[sh->n].i = UINT_MAX; /* in the sorted list of procs sharing a given id, the owner is chosen to be the j^th unflagged proc, where j = id mod (length of list) */ @@ -318,7 +312,7 @@ static void make_topology_unique(struct gs_topology *top, slong *id, sh->n = out - ((struct shared_id*)sh->ptr); /* set primary_shared_id flags to match */ - ((struct shared_id*)sh->ptr)[sh->n].i = -(uint)1; + ((struct shared_id*)sh->ptr)[sh->n].i = UINT_MAX; sarray_sort(struct shared_id,sh->ptr,sh->n, id,1, buf); sarray_sort(struct primary_shared_id,pr->ptr,pr->n, id,1, buf); q=pr->ptr; @@ -357,12 +351,12 @@ static const uint *local_map(const struct array *nz, const 
int ignore_flagged, *p++ = row->i; \ for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \ any=1, *p++ = other->i; \ - if(any) *p++ = -(uint)1; else --p; \ + if(any) *p++ = UINT_MAX; else --p; \ row=other; \ } while(0) if(ignore_flagged) DO_SET(other->flag==0); else DO_SET(1); #undef DO_SET - *p = -(uint)1; + *p = UINT_MAX; return map; } @@ -375,7 +369,7 @@ static const uint *flagged_primaries_map(const struct array *nz, uint *mem_size) p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint); for(row=nz->ptr,end=row+nz->n;row!=end;++row) if(row->i==row->primary && row->flag==1) *p++ = row->i; - *p = -(uint)1; + *p = UINT_MAX; return map; } @@ -385,19 +379,21 @@ static const uint *flagged_primaries_map(const struct array *nz, uint *mem_size) typedef void exec_fun( void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf, - int dstride, int acc, int bufSize); + unsigned transpose, const void *execdata, const struct comm *comm, char *buf); typedef void fin_fun(void *data); struct gs_remote { uint buffer_size, mem_size; void *data; exec_fun *exec; + exec_fun *exec_irecv; + exec_fun *exec_isend; + exec_fun *exec_wait; fin_fun *fin; }; typedef void setup_fun(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf,int dstride); + const struct comm *comm, buffer *buf); /*------------------------------------------------------------------------------ Pairwise Execution @@ -412,9 +408,6 @@ struct pw_comm_data { struct pw_data { struct pw_comm_data comm[2]; const uint *map[2]; - int *mapf[2]; - int mf_nt[2]; - int mf_size[2]; comm_req *req; uint buffer_size; }; @@ -424,14 +417,8 @@ static char *pw_exec_recvs(char *buf, const unsigned unit_size, const struct pw_comm_data *c, comm_req *req) { const uint *p, *pe, *size=c->size; -#ifdef GPUDIRECT -#pragma data present(buf) -#endif for(p=c->p,pe=p+c->n;p!=pe;++p) { size_t len = *(size++)*unit_size; -#ifdef 
GPUDIRECT -#pragma host_data use_device(buf) -#endif comm_irecv(req++,comm,buf,len,*p,*p); buf += len; } @@ -443,14 +430,8 @@ static char *pw_exec_sends(char *buf, const unsigned unit_size, const struct pw_comm_data *c, comm_req *req) { const uint *p, *pe, *size=c->size; -#ifdef GPUDIRECT -#pragma data present(buf) -#endif for(p=c->p,pe=p+c->n;p!=pe;++p) { size_t len = *(size++)*unit_size; -#ifdef GPUDIRECT -#pragma host_data use_device(buf) -#endif comm_isend(req++,comm,buf,len,*p,comm->id); buf += len; } @@ -459,8 +440,7 @@ static char *pw_exec_sends(char *buf, const unsigned unit_size, static void pw_exec( void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, - char *buf,int dstride,int acc,int bufSize) + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) { const struct pw_data *pwd = execdata; static gs_scatter_fun *const scatter_to_buf[] = @@ -469,35 +449,72 @@ static void pw_exec( { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; const unsigned recv = 0^transpose, send = 1^transpose; unsigned unit_size = vn*gs_dom_size[dom]; - int i; char *sendbuf; - -/* post receives */ - // printf("r:pwe: %d %lX %lX %d:\n",pwd->comm[recv].n,(pwd->comm[recv].p),(pwd->comm[recv].size),pwd->comm[recv].total); - //printf("s:pwe: %d %lX %lX %d:\n",pwd->comm[send].n,(pwd->comm[send].p),(pwd->comm[send].size),pwd->comm[send].total); + /* post receives */ sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req); - /* fill send buffer */ - // printf("mode: %d\n",mode); - scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom,dstride,pwd->mf_nt[send], - pwd->mapf[send],pwd->mf_size[send],acc); + scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom); + /* post sends */ + pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send], + &pwd->req[pwd->comm[recv].n]); + comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n); + /* gather using recv buffer */ + 
gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op); +} - double* t = data; +/*------------------------------------------------------------------------------ + Nonblocking Pairwise Execution +------------------------------------------------------------------------------*/ +static void pw_exec_irecv( + void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) +{ + const struct pw_data *pwd = execdata; + static gs_scatter_fun *const scatter_to_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; + static gs_gather_fun *const gather_from_buf[] = + { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; + const unsigned recv = 0^transpose, send = 1^transpose; + unsigned unit_size = vn*gs_dom_size[dom]; + /* post receives */ + char *sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req); +} -#pragma acc update host(sendbuf[0:unit_size*bufSize/2]) if(acc) +static void pw_exec_isend( + void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) +{ + const struct pw_data *pwd = execdata; + static gs_scatter_fun *const scatter_to_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; + static gs_gather_fun *const gather_from_buf[] = + { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; + const unsigned recv = 0^transpose, send = 1^transpose; + unsigned unit_size = vn*gs_dom_size[dom]; + /* fill send buffer */ + char *sendbuf = buf+unit_size*pwd->comm[recv].total; + scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom); /* post sends */ pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send], &pwd->req[pwd->comm[recv].n]); - comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n); - -#pragma acc update device(buf[0:unit_size*bufSize/2]) if(acc) +} -//#pragma update device(pwd->map[recv],pwd->mapf[recv]) 
+static void pw_exec_wait( + void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) +{ + const struct pw_data *pwd = execdata; + static gs_scatter_fun *const scatter_to_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; + static gs_gather_fun *const gather_from_buf[] = + { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; + const unsigned recv = 0^transpose, send = 1^transpose; + unsigned unit_size = vn*gs_dom_size[dom]; + comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n); /* gather using recv buffer */ - gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op,dstride,pwd->mf_nt[recv], - pwd->mapf[recv],pwd->mf_size[recv],acc); + gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op); } /*------------------------------------------------------------------------------ @@ -506,13 +523,13 @@ static void pw_exec( static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh, const unsigned flags_mask, buffer *buf) { - uint n=0,count=0, lp=-(uint)1, mem_size=0; + uint n=0,count=0, lp=UINT_MAX, mem_size=0; struct shared_id *s, *se; /* sort by remote processor and id (a globally consistent ordering) */ sarray_sort_2(struct shared_id,sh->ptr,sh->n, p,0, id,1, buf); /* assign index into buffer */ for(s=sh->ptr,se=s+sh->n;s!=se;++s) { - if(s->flags&flags_mask) { s->bi = -(uint)1; continue; } + if(s->flags&flags_mask) { s->bi = UINT_MAX; continue; } s->bi = count++; if(s->p!=lp) lp=s->p, ++n; } @@ -520,7 +537,7 @@ static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh, data->p = tmalloc(uint,2*n); mem_size+=2*n*sizeof(uint); data->size = data->p + n; data->total = count; - n = 0, lp=-(uint)1; + n = 0, lp=UINT_MAX; for(s=sh->ptr,se=s+sh->n;s!=se;++s) { if(s->flags&flags_mask) continue; if(s->p!=lp) { @@ -546,20 +563,20 @@ static const uint *pw_map_setup(struct array *sh, buffer *buf, uint *mem_size) count=1; 
for(s=sh->ptr,se=s+sh->n;s!=se;) { uint i=s->i; - if(s->bi==-(uint)1) { ++s; continue; } + if(s->bi==UINT_MAX) { ++s; continue; } count+=3; - for(++s;s!=se&&s->i==i;++s) if(s->bi!=-(uint)1) ++count; + for(++s;s!=se&&s->i==i;++s) if(s->bi!=UINT_MAX) ++count; } /* write map */ p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint); for(s=sh->ptr,se=s+sh->n;s!=se;) { uint i=s->i; - if(s->bi==-(uint)1) { ++s; continue; } + if(s->bi==UINT_MAX) { ++s; continue; } *p++ = i, *p++ = s->bi; - for(++s;s!=se&&s->i==i;++s) if(s->bi!=-(uint)1) *p++ = s->bi; - *p++ = -(uint)1; + for(++s;s!=se&&s->i==i;++s) if(s->bi!=UINT_MAX) *p++ = s->bi; + *p++ = UINT_MAX; } - *p = -(uint)1; + *p = UINT_MAX; return map; } @@ -568,35 +585,25 @@ static struct pw_data *pw_setup_aux(struct array *sh, buffer *buf, { struct pw_data *pwd = tmalloc(struct pw_data,1); *mem_size = sizeof(struct pw_data); - + /* default behavior: receive only remotely unflagged data */ *mem_size+=pw_comm_setup(&pwd->comm[0],sh, FLAGS_REMOTE, buf); pwd->map[0] = pw_map_setup(sh, buf, mem_size); - /* Get flattened map */ - gs_flatmap_setup(pwd->map[0],&(pwd->mapf[0]),&(pwd->mf_nt[0]),&(pwd->mf_size[0])); - /* default behavior: send only locally unflagged data */ *mem_size+=pw_comm_setup(&pwd->comm[1],sh, FLAGS_LOCAL, buf); pwd->map[1] = pw_map_setup(sh, buf, mem_size); - /* Get flattened map */ - gs_flatmap_setup(pwd->map[1],&(pwd->mapf[1]),&(pwd->mf_nt[1]),&(pwd->mf_size[1])); - pwd->req = tmalloc(comm_req,pwd->comm[0].n+pwd->comm[1].n); *mem_size += (pwd->comm[0].n+pwd->comm[1].n)*sizeof(comm_req); pwd->buffer_size = pwd->comm[0].total + pwd->comm[1].total; - return pwd; } static void pw_free(struct pw_data *data) { - const uint *map0 = data->map[0],*map1 = data->map[1]; - pw_comm_free(&data->comm[0]); pw_comm_free(&data->comm[1]); -#pragma acc exit data delete(map0,map1) free((uint*)data->map[0]); free((uint*)data->map[1]); free(data->req); @@ -604,23 +611,23 @@ static void pw_free(struct pw_data *data) } static 
void pw_setup(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf,int dstride) + const struct comm *comm, buffer *buf) { struct pw_data *pwd = pw_setup_aux(&top->sh,buf, &r->mem_size); r->buffer_size = pwd->buffer_size; r->data = pwd; r->exec = (exec_fun*)&pw_exec; + r->exec_irecv = (exec_fun*)&pw_exec_irecv; + r->exec_isend = (exec_fun*)&pw_exec_isend; + r->exec_wait = (exec_fun*)&pw_exec_wait; r->fin = (fin_fun*)&pw_free; - } + /*------------------------------------------------------------------------------ Crystal-Router Execution ------------------------------------------------------------------------------*/ struct cr_stage { const uint *scatter_map, *gather_map; - int *scatter_mapf, *gather_mapf; - int s_nt,g_nt; - int s_size,g_size; uint size_r, size_r1, size_r2; uint size_sk, size_s, size_total; uint p1, p2; @@ -635,8 +642,7 @@ struct cr_data { static void cr_exec( void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, - char *buf,int dstride,int acc,int bufSize) + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) { const struct cr_data *crd = execdata; static gs_scatter_fun *const scatter_user_to_buf[] = @@ -651,10 +657,8 @@ static void cr_exec( { &gs_gather, &gs_gather_vec, &gs_gather_vec, &gs_gather }; const unsigned unit_size = vn*gs_dom_size[dom], nstages=crd->nstages; unsigned k; - int id; char *sendbuf, *buf_old, *buf_new; const struct cr_stage *stage = crd->stage[transpose]; - buf_old = buf; buf_new = buf_old + unit_size*crd->stage_buffer_size; /* crystal router */ @@ -667,27 +671,19 @@ static void cr_exec( comm_irecv(&req[2],comm,buf_new+unit_size*stage[k].size_r1, unit_size*stage[k].size_r2, stage[k].p2, comm->np+k); sendbuf = buf_new+unit_size*stage[k].size_r; - // printf("%d\n",mode); if(k==0) - scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].scatter_map,dom,dstride, - 
stage[0].s_nt,stage[0].scatter_mapf,stage[0].s_size,acc); + scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].scatter_map,dom); else - scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].scatter_map,dom,dstride, - stage[k].s_nt,stage[k].scatter_mapf,stage[k].s_size,acc), - gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].gather_map ,dom,op,dstride, - stage[k].g_nt,stage[k].gather_mapf,stage[k].g_size,acc); - //Need to update gather vec and scatter vec! -#pragma acc update host(buf[0:unit_size*bufSize]) if(acc) + scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].scatter_map,dom), + gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].gather_map ,dom,op); + comm_isend(&req[0],comm,sendbuf,unit_size*stage[k].size_s, stage[k].p1, comm->np+k); comm_wait(&req[0],1+stage[k].nrecvn); -#pragma acc update device(buf[0:unit_size*bufSize]) if(acc) { char *t = buf_old; buf_old=buf_new; buf_new=t; } } - scatter_buf_to_user[mode](data,buf_old,vn,stage[k].scatter_map,dom,dstride, - stage[k].s_nt,stage[k].scatter_mapf,stage[k].s_size,acc); - gather_buf_to_user [mode](data,buf_old,vn,stage[k].gather_map ,dom,op,dstride, - stage[k].g_nt,stage[k].gather_mapf,stage[k].g_size,acc); + scatter_buf_to_user[mode](data,buf_old,vn,stage[k].scatter_map,dom); + gather_buf_to_user [mode](data,buf_old,vn,stage[k].gather_map ,dom,op); } /*------------------------------------------------------------------------------ @@ -733,7 +729,7 @@ static void crl_work_init(struct array *cw, struct array *sh, const unsigned send_mask, uint this_p) { const unsigned recv_mask = send_mask^(FLAGS_REMOTE|FLAGS_LOCAL); - uint last_i=-(uint)1; int added_myself; + uint last_i=UINT_MAX; int added_myself; uint cw_n = 0, cw_max = cw->max; struct crl_id *w = cw->ptr; struct shared_id *s, *se; @@ -745,7 +741,7 @@ static void crl_work_init(struct array *cw, struct array *sh, w->id=aid, w->p=ap, w->ri=ari, w->si=asi; \ ++w, ++cw_n; \ } while(0) - + for(s=sh->ptr,se=s+sh->n;s!=se;++s) { int send = 
(s->flags&send_mask)==0; int recv = (s->flags&recv_mask)==0; @@ -757,7 +753,7 @@ static void crl_work_init(struct array *cw, struct array *sh, if(send) CW_ADD(s->id,s->p,s->ri,s->i); } cw->n=cw_n; -#undef CW_ADD +#undef CW_ADD } static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf) @@ -782,10 +778,10 @@ static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf) *gp++ = bi; for(other=w+1;other!=we&&other->bi==bi;++other) if(other->si!=si) si=other->si, any=1, *gp++ = si; - if(any) *gp++ = -(uint)1; else --gp; - *sp++ = -(uint)1; + if(any) *gp++ = UINT_MAX; else --gp; + *sp++ = UINT_MAX; } - *sp=-(uint)1, *gp=-(uint)1; + *sp=UINT_MAX, *gp=UINT_MAX; return mem_size; } @@ -853,12 +849,12 @@ static uint cr_learn(struct array *cw, struct cr_stage *stage, stage->p2,tag); comm_isend(&req[0],comm,nsend,2*sizeof(uint),stage->p1,tag); comm_wait(req,1+stage->nrecvn),++tag; - + stage->size_r1 = nrecv[0][1], stage->size_r2 = nrecv[1][1]; stage->size_r = stage->size_r1 + stage->size_r2; stage->size_total = stage->size_r + stage->size_sk; if(stage->size_total>size_max) size_max=stage->size_total; - + array_reserve(struct crl_id,cw,cw->n+nrecv[0][0]+nrecv[1][0]); wrecv[0] = cw->ptr, wrecv[0] += cw->n, wrecv[1] = wrecv[0]+nrecv[0][0]; wsend = cw->ptr, wsend += nkeep; @@ -878,7 +874,7 @@ static uint cr_learn(struct array *cw, struct cr_stage *stage, memmove(wsend,wrecv[0],(nrecv[0][0]+nrecv[1][0])*sizeof(struct crl_id)); cw->n += nrecv[0][0] + nrecv[1][0]; cw->n -= nsend[0]; - + if(idptr,sh->n, i,0, buf); @@ -905,86 +901,23 @@ static struct cr_data *cr_setup_aux( size_max[0]=cr_learn(&cw,crd->stage[0],comm,buf, mem_size); crl_work_init(&cw,sh, FLAGS_REMOTE, comm->id); size_max[1]=cr_learn(&cw,crd->stage[1],comm,buf, mem_size); - + crd->stage_buffer_size = size_max[1]>size_max[0]?size_max[1]:size_max[0]; array_free(&cw); - - crd->buffer_size = 2*crd->stage_buffer_size; - /* Get the flat maps for the CR */ - const struct cr_stage *stage = 
crd->stage[0]; - for(i=0;instages;i++){ - if(i==0){ - gs_flatmap_setup(stage[0].scatter_map,(int**)&(stage[0].scatter_mapf),(int*)&(stage[0].s_nt), - (int*)&(stage[0].s_size)); -//#pragma acc enter data copyin(stage[0].scatter_map[0:stage[0].s_size],stage[0].scatter_mapf[0:stage[0].s_nt]) - } else { - gs_flatmap_setup(stage[i].scatter_map,(int**)&(stage[i].scatter_mapf),(int*)&(stage[i].s_nt), - (int*)&(stage[i].s_size)); - gs_flatmap_setup(stage[i].gather_map,(int**)&(stage[i].gather_mapf),(int*)&(stage[i].g_nt), - (int*)&(stage[i].g_size)); -//#pragma acc enter data copyin(stage[i].scatter_map[i:stage[i].s_size],stage[i].scatter_mapf[i:stage[i].s_nt]) -//#pragma acc enter data copyin(stage[i].gather_map[i:stage[i].g_size],stage[i].gather_mapf[i:stage[i].g_nt]) - } - } - gs_flatmap_setup(stage[i].scatter_map,(int**)&(stage[i].scatter_mapf),(int*)&(stage[i].s_nt), - (int*)&(stage[i].s_size)); - gs_flatmap_setup(stage[i].gather_map,(int**)&(stage[i].gather_mapf),(int*)&(stage[i].g_nt), - (int*)&(stage[i].g_size)); -//#pragma acc enter data copyin(stage[i].scatter_map[i:stage[i].s_size],stage[i].scatter_mapf[i:stage[i].s_nt]) -//#pragma acc enter data copyin(stage[i].gather_map[i:stage[i].g_size],stage[i].gather_mapf[i:stage[i].g_nt]) - - - const struct cr_stage *stage2 = crd->stage[1]; - for(i=0;instages;i++){ - if(i==0){ - gs_flatmap_setup(stage2[0].scatter_map,(int**)&(stage2[0].scatter_mapf), - (int*)&(stage2[0].s_nt),(int*)&(stage2[0].s_size)); -//#pragma acc enter data copyin(stage2[0].scatter_map[0:stage2[0].s_size],stage2[0].scatter_mapf[0:stage2[0].s_nt]) - } else { - gs_flatmap_setup(stage2[i].scatter_map,(int**)&(stage2[i].scatter_mapf), - (int*)&(stage2[i].s_nt),(int*)&(stage2[i].s_size)); - gs_flatmap_setup(stage2[i].gather_map,(int**)&(stage2[i].gather_mapf), - (int*)&(stage2[i].g_nt),(int*)&(stage2[i].g_size)); -//#pragma acc enter data copyin(stage2[i].scatter_map[i:stage2[i].s_size],stage2[i].scatter_mapf[i:stage2[i].s_nt]) -//#pragma acc enter data 
copyin(stage2[i].gather_map[i:stage2[i].g_size],stage2[i].gather_mapf[i:stage2[i].g_nt]) - } - } - gs_flatmap_setup(stage2[i].scatter_map,(int**)&(stage2[i].scatter_mapf), - (int*)&(stage2[i].s_nt),(int*)&(stage2[i].s_size)); - gs_flatmap_setup(stage2[i].gather_map,(int**)&(stage2[i].gather_mapf), - (int*)&(stage2[i].g_nt),(int*)&(stage2[i].g_size)); -//#pragma acc enter data copyin(stage2[i].scatter_map[i:stage2[i].s_size],stage2[i].scatter_mapf[i:stage2[i].s_nt]) -//#pragma acc enter data copyin(stage2[i].gather_map[i:stage2[i].g_size],stage2[i].gather_mapf[i:stage2[i].g_nt]) + crd->buffer_size = 2*crd->stage_buffer_size; return crd; } static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax) { unsigned k; - int *map,*mapf; for(k=0; kscatter_map; - mapf = stage->scatter_mapf; -#pragma acc exit data delete(map,mapf) - if(k!=0) { - map = stage->gather_map; - mapf = stage->gather_mapf; -#pragma acc exit data delete(map,mapf) - } free((uint*)stage->scatter_map); - free((uint*)stage->scatter_mapf); ++stage; } - map = stage->scatter_map; - mapf = stage->scatter_mapf; -#pragma acc exit data delete(map,mapf) - map = stage->gather_map; - mapf = stage->gather_mapf; -#pragma acc exit data delete(map,mapf) free((uint*)stage->scatter_map); - free((uint*)stage->scatter_mapf); } static void cr_free(struct cr_data *data) @@ -1010,16 +943,13 @@ static void cr_setup(struct gs_remote *r, struct gs_topology *top, ------------------------------------------------------------------------------*/ struct allreduce_data { const uint *map_to_buf[2], *map_from_buf[2]; - int *map_to_buf_f[2],*map_from_buf_f[2]; - int mt_nt[2],mf_nt[2]; - int mt_size[2],mf_size[2]; uint buffer_size; + comm_req *req; }; static void allreduce_exec( void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, - char *buf, int dstride,int acc, int bufSize) + unsigned transpose, const void *execdata, const struct comm *comm, char 
*buf) { const struct allreduce_data *ard = execdata; static gs_scatter_fun *const scatter_to_buf[] = @@ -1029,26 +959,60 @@ static void allreduce_exec( uint gvn = vn*(ard->buffer_size/2); unsigned unit_size = gs_dom_size[dom]; char *ardbuf; - int id,i; ardbuf = buf+unit_size*gvn; - double *ddata = data; - /* user array -> buffer */ - gs_init_array(buf,gvn,dom,op,acc); + gs_init_array(buf,gvn,dom,op); + scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom); + /* all reduce */ + comm_allreduce(comm,dom,op, buf,gvn, ardbuf); + /* buffer -> user array */ + scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom); +} - scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom,dstride, - ard->mt_nt[transpose],ard->map_to_buf_f[transpose], - ard->mt_size[transpose],acc); +/*------------------------------------------------------------------------------ + Nonblocking All-reduce Execution +------------------------------------------------------------------------------*/ +static void allreduce_exec_i( + void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) +{ + const struct allreduce_data *ard = execdata; + static gs_scatter_fun *const scatter_to_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; + static gs_scatter_fun *const scatter_from_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop }; + uint gvn = vn*(ard->buffer_size/2); + unsigned unit_size = gs_dom_size[dom]; + char *ardbuf = buf+unit_size*gvn; + /* user array -> buffer */ + gs_init_array(buf,gvn,dom,op); + scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom); /* all reduce */ -#pragma acc update host(buf[0:vn*unit_size*bufSize]) if(acc) - comm_allreduce(comm,dom,op, buf,gvn, ardbuf); - /* buffer -> user array */ -#pragma acc update device(buf[0:vn*unit_size*bufSize]) if(acc) - 
scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom,dstride, - ard->mf_nt[transpose],ard->map_from_buf_f[transpose], - ard->mf_size[transpose],acc); + comm_iallreduce(ard->req,comm,dom,op,buf,gvn,ardbuf); +} + +static void allreduce_exec_wait( + void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, const void *execdata, const struct comm *comm, char *buf) +{ + const struct allreduce_data *ard = execdata; + static gs_scatter_fun *const scatter_to_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; + static gs_scatter_fun *const scatter_from_buf[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop }; + uint gvn = vn*(ard->buffer_size/2); + unsigned unit_size = gs_dom_size[dom]; + char *ardbuf = buf+unit_size*gvn; + // Why do I need this? Ugly !!! + if (comm->np > 1) + comm_wait(ard->req, 1); +#ifdef MPI + memcpy(buf,ardbuf,gvn*gs_dom_size[dom]); +#endif + /* buffer -> user array */ + scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom); } /*------------------------------------------------------------------------------ @@ -1065,13 +1029,13 @@ static const uint *allreduce_map_setup( if(to_buf) { for(p=pr->ptr,pe=p+pr->n;p!=pe;++p) if((p->flag&flags_mask)==0) - *m++ = p->i, *m++ = p->ord, *m++ = -(uint)1; + *m++ = p->i, *m++ = p->ord, *m++ = UINT_MAX; } else { for(p=pr->ptr,pe=p+pr->n;p!=pe;++p) if((p->flag&flags_mask)==0) - *m++ = p->ord, *m++ = p->i, *m++ = -(uint)1; + *m++ = p->ord, *m++ = p->i, *m++ = UINT_MAX; } - *m=-(uint)1; + *m=UINT_MAX; return map; } @@ -1080,29 +1044,23 @@ static struct allreduce_data *allreduce_setup_aux( { struct allreduce_data *ard = tmalloc(struct allreduce_data,1); *mem_size = sizeof(struct allreduce_data); - + /* default behavior: reduce only unflagged data, copy to all */ ard->map_to_buf [0] = allreduce_map_setup(pr,1,1, mem_size); ard->map_from_buf[0] = allreduce_map_setup(pr,0,0, mem_size); - 
gs_flatmap_setup(ard->map_to_buf[0],&(ard->map_to_buf_f[0]),&(ard->mt_nt[0]),&(ard->mt_size[0])); - gs_flatmap_setup(ard->map_from_buf[0],&(ard->map_from_buf_f[0]),&(ard->mf_nt[0]), - &(ard->mf_size[0])); /* transpose behavior: reduce all data, copy to unflagged */ ard->map_to_buf [1] = allreduce_map_setup(pr,0,1, mem_size); ard->map_from_buf[1] = allreduce_map_setup(pr,1,0, mem_size); - gs_flatmap_setup(ard->map_to_buf[1],&(ard->map_to_buf_f[1]),&(ard->mt_nt[1]),&(ard->mt_size[1])); - gs_flatmap_setup(ard->map_from_buf[1],&(ard->map_from_buf_f[1]),&(ard->mf_nt[1]), - &(ard->mf_size[1])); - + ard->req = tmalloc(comm_req, 1); + ard->buffer_size = total_shared*2; return ard; } static void allreduce_free(struct allreduce_data *ard) { - //#pragma acc exit data delete(ard->map_to_buf[0],ard->map_to_buf[1],ard->map_from_buf[0],ard->map_from_buf[1]) free((uint*)ard->map_to_buf[0]); free((uint*)ard->map_to_buf[1]); free((uint*)ard->map_from_buf[0]); @@ -1111,17 +1069,17 @@ static void allreduce_free(struct allreduce_data *ard) } static void allreduce_setup(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf,int dstride) + const struct comm *comm, buffer *buf) { struct allreduce_data *ard = allreduce_setup_aux(&top->pr,top->total_shared, &r->mem_size); r->buffer_size = ard->buffer_size; r->data = ard; r->exec = (exec_fun*)&allreduce_exec; + r->exec_irecv = (exec_fun*)&allreduce_exec_i; + r->exec_isend = NULL; + r->exec_wait = (exec_fun*)&allreduce_exec_wait; r->fin = (fin_fun*)&allreduce_free; - - //#pragma acc enter data copyin(ard->map_to_buf[0][0:ard->mt_size[0]],ard->map_from_buf[0][0:ard->mf_size[0]],ard->map_to_buf_f[0][0:ard->mt_nt[0]],ard->map_from_buf_f[0][0:ard->mf_nt[0]],ard->map_to_buf[1][0:ard->mt_size[1]],ard->map_from_buf[1][0:ard->mf_size[1]],ard->map_to_buf_f[1][0:ard->mt_nt[1]],ard->map_from_buf_f[1][0:ard->mf_nt[1]]) - } /*------------------------------------------------------------------------------ @@ -1134,11 +1092,11 
@@ static void dry_run_time(double times[3], const struct gs_remote *r, int i; double t; buffer_reserve(buf,gs_dom_size[gs_double]*r->buffer_size); for(i= 2;i;--i) - r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr,0,0,0); + r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr); comm_barrier(comm); t = comm_time(); for(i=10;i;--i) - r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr,0,0,0); + r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr); t = (comm_time() - t)/10; times[0] = t/comm->np, times[1] = t, times[2] = t; comm_allreduce(comm,gs_double,gs_add, ×[0],1, &t); @@ -1147,25 +1105,22 @@ static void dry_run_time(double times[3], const struct gs_remote *r, } static void auto_setup(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf,int dstride) + const struct comm *comm, buffer *buf) { - pw_setup(r, top,comm,buf,dstride); + pw_setup(r, top,comm,buf); if(comm->np>1) { const char *name = "pairwise"; struct gs_remote r_alt; double time[2][3]; -#if 0 //Added to force it to use pw when OpenACC is defined - Matt Otten - 10-28-14 - if(comm->id==0) printf(" used all_to_all method ACC: %s\n",name); -#else - #define DRY_RUN(i,gsr,str) do { \ + #define DRY_RUN(i,gsr,str) do { \ if(comm->id==0) printf(" " str ": "); \ dry_run_time(time[i],gsr,comm,buf); \ if(comm->id==0) \ printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \ } while(0) - + #define DRY_RUN_CHECK(str,new_name) do { \ DRY_RUN(1,&r_alt,str); \ if(time[1][2]total_shared<100000) { - allreduce_setup(&r_alt, top,comm,buf,dstride); + allreduce_setup(&r_alt, top,comm,buf); DRY_RUN_CHECK( "all reduce ", "allreduce"); } #undef DRY_RUN_CHECK #undef DRY_RUN - if(comm->id==1) printf(" used all_to_all method: %s\n",name); -#endif - } -} - - -void print_acc(double *a,int n){ - int i; -#pragma acc update host(a[0:n]) - for(i=0;iid==0) printf(" used all_to_all method: %s\n",name); } - printf("\n"); } 
/*------------------------------------------------------------------------------ @@ -1220,22 +1155,34 @@ struct gs_data { const uint *map_local[2]; /* 0=unflagged, 1=all */ const uint *flagged_primaries; struct gs_remote r; - int *map_localf[2]; - int m_size[2]; - int fp_size; - int mf_nt[2]; - int dstride; - int u_size; uint handle_size; }; +/*------------------------------------------------------------------------------ + GS_AUX - blocking and non-blocking +------------------------------------------------------------------------------*/ static void gs_aux( void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf) { - int acc, i; - char *bufPtr; + static gs_scatter_fun *const local_scatter[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; + static gs_gather_fun *const local_gather [] = + { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; + static gs_init_fun *const init[] = + { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; + if(!buf) buf = &static_buffer; + buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size); + local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op); + if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op); + gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); + local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom); +} +static void gs_aux_irecv( + void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, + struct gs_data *gsh, buffer *buf) +{ static gs_scatter_fun *const local_scatter[] = { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; static gs_gather_fun *const local_gather [] = @@ -1243,55 +1190,189 @@ static void gs_aux( static gs_init_fun *const init[] = { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; if(!buf) buf = &static_buffer; - bufPtr = buf->ptr; -#pragma acc exit data delete(bufPtr) buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size); - 
bufPtr = buf->ptr; -#pragma acc enter data create(bufPtr[0:vn*gs_dom_size[dom]*gsh->r.buffer_size]) - acc = 0; -#ifdef _OPENACC - if(acc_is_present(u,1)) { - acc = 1; - printf("ACC IS ON\n"); - } else { - printf("ACC IS OFF\n"); - } -#endif - local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op,gsh->dstride, - gsh->mf_nt[0^transpose],gsh->map_localf[0^transpose], - gsh->m_size[0^transpose],acc); + local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op); + if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op); + + if (gsh->r.exec_irecv) + gsh->r.exec_irecv(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); +} + +static void gs_aux_isend( + void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, + struct gs_data *gsh, buffer *buf) +{ + static gs_scatter_fun *const local_scatter[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; + static gs_gather_fun *const local_gather [] = + { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; + static gs_init_fun *const init[] = + { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; + + if(!buf) buf = &static_buffer; + + if (gsh->r.exec_isend) + gsh->r.exec_isend(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); +} - if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op,gsh->dstride, - gsh->fp_size,acc); +static void gs_aux_wait( + void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, + struct gs_data *gsh, buffer *buf) +{ + static gs_scatter_fun *const local_scatter[] = + { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; + static gs_gather_fun *const local_gather [] = + { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; + static gs_init_fun *const init[] = + { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; - // printf("before exec: buf->ptr %p-%p\n",buf->ptr,buf->ptr+vn*gs_dom_size[dom]*gsh->r.buffer_size); - // printf("mode gs: %d\n",mode); - 
gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr,gsh->dstride,acc,gsh->r.buffer_size); + if(!buf) buf = &static_buffer; - local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom,gsh->dstride, - gsh->mf_nt[1^transpose],gsh->map_localf[1^transpose], - gsh->m_size[1^transpose],acc); + if (gsh->r.exec_wait) + gsh->r.exec_wait(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); + local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom); } +/*------------------------------------------------------------------------------ + GS interface - blocking and non-blocking +------------------------------------------------------------------------------*/ +struct nonblocking_private { + void *u; + gs_mode mode; + gs_dom dom; + gs_op op; + unsigned transpose; + struct gs_data *gsh; + buffer *buf; + unsigned vn; +}; + +typedef struct nonblocking_private* nblkng; + +static nblkng *nblkng_dict; +static int nblkng_max = 0; +static int nblkng_n = 0; +static int nblkng_count = 0; + void gs(void *u, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf) { gs_aux(u,mode_plain,1,dom,op,transpose,gsh,buf); } +void igs(void *u, gs_dom dom, gs_op op, unsigned transpose, + struct gs_data *gsh, buffer *buf, int *handle) +{ + if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1, + nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max); + + nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1); + + nblkng_dict[nblkng_n]->u = u; + nblkng_dict[nblkng_n]->dom = dom; + nblkng_dict[nblkng_n]->op = op; + nblkng_dict[nblkng_n]->transpose = transpose; + nblkng_dict[nblkng_n]->gsh = gsh; + nblkng_dict[nblkng_n]->buf = buf; + nblkng_dict[nblkng_n]->mode = mode_plain; + nblkng_dict[nblkng_n]->vn = 1; + + *handle = nblkng_n++; + nblkng_count++; + + gs_aux_irecv(u,mode_plain,1,dom,op,transpose,gsh,buf); + gs_aux_isend(u,mode_plain,1,dom,op,transpose,gsh,buf); +} + +void gs_wait(int handle) +{ + if(handle < nblkng_n) { + 
gs_aux_wait(nblkng_dict[handle]->u, + nblkng_dict[handle]->mode, + nblkng_dict[handle]->vn, + nblkng_dict[handle]->dom, + nblkng_dict[handle]->op, + nblkng_dict[handle]->transpose, + nblkng_dict[handle]->gsh, + nblkng_dict[handle]->buf); + free(nblkng_dict[handle]); + nblkng_dict[handle] = 0; + nblkng_count--; + } + + if(nblkng_count == 0) { + free(nblkng_dict); + nblkng_dict = 0; + nblkng_max = 0; + nblkng_n = 0; + } +} +/*------------------------------------------------------------------------------ + GS_VEC interface - blocking and non-blocking +------------------------------------------------------------------------------*/ void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf) { gs_aux(u,mode_vec,vn,dom,op,transpose,gsh,buf); } -void gs_many(void *u, unsigned vn, gs_dom dom, gs_op op, +void igs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle) +{ + if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1, + nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max); + + nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1); + + nblkng_dict[nblkng_n]->u = u; + nblkng_dict[nblkng_n]->dom = dom; + nblkng_dict[nblkng_n]->op = op; + nblkng_dict[nblkng_n]->transpose = transpose; + nblkng_dict[nblkng_n]->gsh = gsh; + nblkng_dict[nblkng_n]->buf = buf; + nblkng_dict[nblkng_n]->vn = vn; + nblkng_dict[nblkng_n]->mode = mode_vec; + + *handle = nblkng_n++; + nblkng_count++; + + gs_aux_irecv(u,mode_vec,vn,dom,op,transpose,gsh,buf); + gs_aux_isend(u,mode_vec,vn,dom,op,transpose,gsh,buf); +} +/*------------------------------------------------------------------------------ + GS_MANY interface - blocking and non-blocking +------------------------------------------------------------------------------*/ +void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf) { 
gs_aux((void*)u,mode_many,vn,dom,op,transpose,gsh,buf); } +void igs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle) +{ + if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1, + nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max); + + nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1); + + nblkng_dict[nblkng_n]->u = (void *)u; + nblkng_dict[nblkng_n]->dom = dom; + nblkng_dict[nblkng_n]->op = op; + nblkng_dict[nblkng_n]->transpose = transpose; + nblkng_dict[nblkng_n]->gsh = gsh; + nblkng_dict[nblkng_n]->buf = buf; + nblkng_dict[nblkng_n]->vn = vn; + nblkng_dict[nblkng_n]->mode = mode_many; + + *handle = nblkng_n++; + nblkng_count++; + + gs_aux_irecv((void *)u,mode_many,vn,dom,op,transpose,gsh,buf); + gs_aux_isend((void *)u,mode_many,vn,dom,op,transpose,gsh,buf); +} + /*------------------------------------------------------------------------------ Main Setup ------------------------------------------------------------------------------*/ @@ -1299,30 +1380,10 @@ typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method; static uint local_setup(struct gs_data *gsh, const struct array *nz) { - uint mem_size = 0,s=0,i; - int mf_temp; - char hname[1024]; - - // gethostname(hname, sizeof(hname)); - - s = 0; - gsh->map_local[0] = local_map(nz,1, &s); - gs_flatmap_setup(gsh->map_local[0],&(gsh->map_localf[0]),&(gsh->mf_nt[0]),&(gsh->m_size[0])); - - - mem_size += s; - //fprintf(stderr,"%s: map[0:%d] -> %lX : %lX\n",hname,s/4,gsh->map_local[0],((void*)gsh->map_local[0])+s); - s = 0; - gsh->map_local[1] = local_map(nz,0, &s); - gs_flatmap_setup(gsh->map_local[1],&(gsh->map_localf[1]),&(gsh->mf_nt[1]),&(gsh->m_size[1])); - mem_size += s; - //fprintf(stderr,"%s: t_map[0:%d] -> %lX : %lX\n",hname,s/4,gsh->map_local[1],((void*)gsh->map_local[1])+s); - s = 0; - gsh->flagged_primaries = flagged_primaries_map(nz, &s); - gsh->fp_size = fp_map_size(gsh->flagged_primaries); - 
mem_size += s; - //fprintf(stderr,"%s: fp_map[0:%d] -> %lX : %lX\n",hname,s/4,gsh->flagged_primaries,((void*)gsh->flagged_primaries)+s); - + uint mem_size = 0; + gsh->map_local[0] = local_map(nz,1, &mem_size); + gsh->map_local[1] = local_map(nz,0, &mem_size); + gsh->flagged_primaries = flagged_primaries_map(nz, &mem_size); return mem_size; } @@ -1334,6 +1395,7 @@ static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n, struct gs_topology top; struct crystal cr; + crystal_init(&cr,&gsh->comm); get_topology(&top, id,n, &cr); @@ -1341,11 +1403,11 @@ static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n, gsh->handle_size = sizeof(struct gs_data); gsh->handle_size += local_setup(gsh,&top.nz); - gsh->dstride = (int)n; + if(verbose && gsh->comm.id==0) printf("gs_setup: %ld unique labels shared\n",(long)top.total_shared); - remote_setup[method](&gsh->r, &top,&gsh->comm,&cr.data,gsh->dstride); + remote_setup[method](&gsh->r, &top,&gsh->comm,&cr.data); gsh->handle_size += gsh->r.mem_size; if(verbose) { /* report memory usage */ @@ -1374,17 +1436,12 @@ struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm, struct gs_data *gsh = tmalloc(struct gs_data,1); comm_dup(&gsh->comm,comm); gs_setup_aux(gsh,id,n,unique,method,verbose); - return gsh; } void gs_free(struct gs_data *gsh) { comm_free(&gsh->comm); - const uint *map_local0 = gsh->map_local[0],*map_local1 = gsh->map_local[1],*flagged_primaries = gsh->flagged_primaries; - -#pragma acc exit data delete(map_local0,map_local1,flagged_primaries) - free((uint*)gsh->map_local[0]), free((uint*)gsh->map_local[1]); free((uint*)gsh->flagged_primaries); gsh->r.fin(gsh->r.data); @@ -1402,142 +1459,72 @@ void gs_unique(slong *id, uint n, const struct comm *comm) crystal_free(&cr); } -void gs_flatmap_setup(const uint *map, int **mapf, int *mf_nt, int *m_size) -{ - uint i,j,k; - int mf_temp; - - *m_size = map_size(map,&mf_temp); - - *mf_nt = mf_temp; - - *mapf = 
(int*)malloc(mf_temp*2*sizeof(int)); - - for(i=0,k=0;map[i]!=-1;i=j+1,k++){ - // Record i - *(*mapf+k*2) = i; - for(j=i+1;map[j]!=-1;j++); - // Record j-i - *(*mapf+k*2+1) = j-i-1; - } - int *mapf2 = *mapf; -#pragma acc enter data pcopyin(map[0:*m_size],mapf2[0:2*mf_temp]) - - return; -} - -static int map_size(const uint *map, int *t) -{ - int i,ct=0; - - *t = 0; - - // No map - if(!map) { - return 0; - } - - // "Empty" map (contains only a single -1 terminator) - if(map[0] == -1) { - return 1; - } - - // "Regular" map (contains two -1's as termination) - for(i=ct=0;ct<2;i++){ - if(map[i]==-1){ - ct++; - (*t)++; - } else { - ct=0; - } - } - (*t)--; - - return i; -} - -static int fp_map_size(const uint *map) -{ - int i,ct=0; - - // No map - if(!map) { - return 0; - } - - // "Empty" map (contains only a single -1 terminator) - if(map[0] == -1) { - return 0; - } - - i=0; - // "Regular" map (contains two -1's as termination) - while(map[i]!=-1){ - i++; - } - return i; -} - - /*------------------------------------------------------------------------------ FORTRAN interface ------------------------------------------------------------------------------*/ #undef gs_op +#undef gs_unique #undef gs_free #undef gs_setup #undef gs_many #undef gs_vec #undef gs -#define cgs PREFIXED_NAME(gs ) -#define cgs_vec PREFIXED_NAME(gs_vec ) -#define cgs_many PREFIXED_NAME(gs_many ) -#define cgs_setup PREFIXED_NAME(gs_setup) -#define cgs_free PREFIXED_NAME(gs_free ) +#undef igs +#undef igs_vec +#undef igs_many +#undef gs_wait + +#define cgs PREFIXED_NAME(gs ) +#define cgs_vec PREFIXED_NAME(gs_vec ) +#define cgs_many PREFIXED_NAME(gs_many ) +#define cgs_setup PREFIXED_NAME(gs_setup) +#define cgs_free PREFIXED_NAME(gs_free ) +#define cgs_unique PREFIXED_NAME(gs_unique) +#define cigs PREFIXED_NAME(igs ) +#define cigs_vec PREFIXED_NAME(igs_vec ) +#define cigs_many PREFIXED_NAME(igs_many) +#define cgs_wait PREFIXED_NAME(gs_wait ) #define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK) 
#define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP ) #define fgs FORTRAN_NAME(gs_op ,GS_OP ) #define fgs_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) #define fgs_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) +#define figs FORTRAN_NAME(igs_op ,IGS_OP ) +#define figs_vec FORTRAN_NAME(igs_op_vec ,IGS_OP_VEC ) +#define figs_many FORTRAN_NAME(igs_op_many ,IGS_OP_MANY ) +#define fgs_wait FORTRAN_NAME(gs_op_wait ,GS_OP_WAIT ) #define fgs_fields FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS ) #define fgs_free FORTRAN_NAME(gs_free ,GS_FREE ) +#define fgs_unique FORTRAN_NAME(gs_unique ,GS_UNIQUE ) static struct gs_data **fgs_info = 0; static int fgs_max = 0; static int fgs_n = 0; +struct gs_data* gs_hf2c(const sint gsh) +{ + return fgs_info[gsh]; +} + void fgs_setup_pick(sint *handle, const slong id[], const sint *n, const MPI_Fint *comm, const sint *np, const sint *method) { struct gs_data *gsh; - if(fgs_n==fgs_max) fgs_max+=fgs_max/2+1, fgs_info=trealloc(struct gs_data*,fgs_info,fgs_max); gsh=fgs_info[fgs_n]=tmalloc(struct gs_data,1); comm_init_check(&gsh->comm,*comm,*np); -#ifdef _OPENACC -#ifdef GPUDIRECT - if(gsh->comm.id==0) printf(" USE_GPU_DIRECT=1 \n"); -#else - if(gsh->comm.id==0) printf(" USE_GPU_DIRECT=0 \n"); -#endif -#endif gs_setup_aux(gsh,id,*n,0,*method,1); - - *handle = fgs_n++; } void fgs_setup(sint *handle, const slong id[], const sint *n, const MPI_Fint *comm, const sint *np) { -#ifdef _OPENACC - const sint method = gs_pairwise; -#else const sint method = gs_auto; -#endif fgs_setup_pick(handle,id,n,comm,np,&method); } @@ -1547,18 +1534,32 @@ static void fgs_check_handle(sint handle, const char *func, unsigned line) fail(1,__FILE__,line,"%s: invalid handle", func); } -static const gs_dom fgs_dom[5] = { 0, gs_double, gs_sint, gs_slong, gs_float }; +static const gs_dom fgs_dom[4] = { 0, gs_double, gs_sint, gs_slong }; static void fgs_check_parms(sint handle, sint dom, sint op, const char *func, unsigned line) { - if(dom<1 || dom>4) + if(dom<1 || dom>3) 
fail(1,__FILE__,line,"%s: datatype %d not in valid range 1-3",func,dom); if(op <1 || op >4) fail(1,__FILE__,line,"%s: op %d not in valid range 1-4",func,op); fgs_check_handle(handle,func,line); } +void fgs(const sint *handle, void *u, const sint *dom, const sint *op, + const sint *transpose) +{ + fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__); + cgs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0); +} + +void figs(const sint *handle, void *u, const sint *dom, const sint *op, + const sint *transpose, int *wait) +{ + fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__); + cigs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0,wait); +} + void fgs_vec(const sint *handle, void *u, const sint *n, const sint *dom, const sint *op, const sint *transpose) { @@ -1567,59 +1568,63 @@ void fgs_vec(const sint *handle, void *u, const sint *n, fgs_info[*handle],0); } +void figs_vec(const sint *handle, void *u, const sint *n, + const sint *dom, const sint *op, const sint *transpose, int *wait) +{ + fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__); + cigs_vec(u,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, + fgs_info[*handle],0,wait); +} + void fgs_many(const sint *handle, void *u1, void *u2, void *u3, void *u4, void *u5, void *u6, const sint *n, const sint *dom, const sint *op, const sint *transpose) { - int i; void *uu[6]; uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6; fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__); - // Temporary patch for fgs_many - cgs_many has memory errors with the new - // format - for(i=0;i<*n;i++) { - cgs(uu[i],fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0); - } - - //cgs_many(uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, - // fgs_info[*handle],0); + cgs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, + fgs_info[*handle],0); } - -static struct array fgs_fields_array = null_array; - - - - -void fgs(const sint *handle, void *u, const sint *dom, 
const sint *op, - const sint *transpose) +void figs_many(const sint *handle, void *u1, void *u2, void *u3, + void *u4, void *u5, void *u6, const sint *n, + const sint *dom, const sint *op, const sint *transpose, + int *wait) { - fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__); - - cgs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0); + void *uu[6]; + uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6; + fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__); + cigs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, + fgs_info[*handle],0,wait); +} +void fgs_wait(int *handle) +{ + cgs_wait(*handle); } +static struct array fgs_fields_array = null_array; void fgs_fields(const sint *handle, void *u, const sint *stride, const sint *n, const sint *dom, const sint *op, const sint *transpose) { size_t offset; + void **p; uint i; fgs_check_parms(*handle,*dom,*op,"gs_op_fields",__LINE__); if(*n<0) return; - // array_reserve(void*,&fgs_fields_array,*n); - //p = fgs_fields_array.ptr; - //offset = *stride * gs_dom_size[*dom-1]; - - // for(i=*n;i;--i) *p++ = u, u = (char*)u + offset; - cgs_many(u,*n, - fgs_dom[*dom],(gs_op_t)(*op-1), - *transpose!=0, fgs_info[*handle],0); + array_reserve(void*,&fgs_fields_array,*n); + p = fgs_fields_array.ptr; + offset = *stride * gs_dom_size[*dom-1]; + for(i=*n;i;--i) *p++ = u, u = (char*)u + offset; + cgs_many((void *const*)fgs_fields_array.ptr,*n, + fgs_dom[*dom],(gs_op_t)(*op-1), + *transpose!=0, fgs_info[*handle],0); } void fgs_free(const sint *handle) @@ -1628,3 +1633,13 @@ void fgs_free(const sint *handle) cgs_free(fgs_info[*handle]); fgs_info[*handle] = 0; } + +void fgs_unique(slong id[], const sint *n, const MPI_Fint *c, const sint *np) +{ + struct comm *comm; + uint un = *n; + comm = tmalloc(struct comm, 1); + comm_init_check(comm, *c, *np); + cgs_unique(id, un, comm); + free(comm); +} diff --git a/3rdParty/gslib.github/src/gs.h b/3rdParty/gslib/src/gs.h similarity index 88% rename from 
3rdParty/gslib.github/src/gs.h rename to 3rdParty/gslib/src/gs.h index 55b8b11a8..5598e589c 100644 --- a/3rdParty/gslib.github/src/gs.h +++ b/3rdParty/gslib/src/gs.h @@ -119,9 +119,14 @@ #define gs PREFIXED_NAME(gs ) #define gs_vec PREFIXED_NAME(gs_vec ) #define gs_many PREFIXED_NAME(gs_many ) +#define igs PREFIXED_NAME(igs ) +#define igs_vec PREFIXED_NAME(igs_vec ) +#define igs_many PREFIXED_NAME(igs_many ) +#define gs_wait PREFIXED_NAME(gs_wait ) #define gs_setup PREFIXED_NAME(gs_setup ) #define gs_free PREFIXED_NAME(gs_free ) #define gs_unique PREFIXED_NAME(gs_unique) +#define gs_hf2c PREFIXED_NAME(gs_hf2c ) struct gs_data; typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method; @@ -132,9 +137,19 @@ void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf); void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf); + +void igs(void *u, gs_dom dom, gs_op op, unsigned transpose, + struct gs_data *gsh, buffer *buf, int *handle); +void igs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle); +void igs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, + unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle); +void gs_wait(int handle); + struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm, int unique, gs_method method, int verbose); void gs_free(struct gs_data *gsh); void gs_unique(slong *id, uint n, const struct comm *comm); +struct gs_data* gs_hf2c(const sint gsh); #endif diff --git a/3rdParty/gslib.github/src/gs_defs.h b/3rdParty/gslib/src/gs_defs.h similarity index 100% rename from 3rdParty/gslib.github/src/gs_defs.h rename to 3rdParty/gslib/src/gs_defs.h diff --git a/3rdParty/gslib.github/src/gs_local.c b/3rdParty/gslib/src/gs_local.c similarity index 63% rename from 3rdParty/gslib.github/src/gs_local.c rename to 
3rdParty/gslib/src/gs_local.c index ae0ce4116..170e94d4c 100644 --- a/3rdParty/gslib.github/src/gs_local.c +++ b/3rdParty/gslib/src/gs_local.c @@ -29,7 +29,7 @@ GS_DEFINE_DOM_SIZES() ------------------------------------------------------------------------------*/ #define DEFINE_GATHER(T,OP) \ static void gather_array_##T##_##OP( \ - T *restrict out, const T *restrict in, uint n) \ + T *restrict out, const T *restrict in, uint n) \ { \ for(;n;--n) { T q = *in++, *p = out++; GS_DO_##OP(*p,q); } \ } @@ -38,12 +38,10 @@ static void gather_array_##T##_##OP( \ The array initialization kernel ------------------------------------------------------------------------------*/ #define DEFINE_INIT(T) \ - static void init_array_##T(T *restrict out, uint n, gs_op op,int acc) \ +static void init_array_##T(T *restrict out, uint n, gs_op op) \ { \ const T e = gs_identity_##T[op]; \ - int i; \ - _Pragma("acc parallel loop present(out) if(acc)")\ - for(i=0;i multiple arrays, ------------------------------------------------------------------------------*/ void gs_gather_vec_to_many(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op, int dstride, - int mf_nt, int *mapf, int m_size,int acc) + const uint *map, gs_dom dom, gs_op op) { - #define WITH_OP(T,OP) \ - gather_##T##_##OP(out,in,vn,map,dstride,mf_nt,mapf,vn,m_size,acc) + unsigned i; const unsigned unit_size = gs_dom_size[dom]; + typedef void *ptr_to_void; + const ptr_to_void *p = out; const char *q = in; +#define WITH_OP(T,OP) \ + for(i=vn;i;--i) gather_##T##_##OP(*p++,(const T*)q,vn,map), q+=unit_size #define WITH_DOMAIN(T) SWITCH_OP(T,op) SWITCH_DOMAIN(dom); #undef WITH_DOMAIN @@ -339,28 +307,30 @@ void gs_gather_vec_to_many(void *out, const void *in, const unsigned vn, } void gs_scatter_many_to_vec(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, int dstride, - int mf_nt, int *mapf, int m_size, int acc) + const uint *map, gs_dom dom) { + unsigned i; const unsigned 
unit_size = gs_dom_size[dom]; + typedef const void *ptr_to_const_void; + char *p = out; const ptr_to_const_void *q = in; #define WITH_DOMAIN(T) \ - scatter_##T(out,vn,in,1,map,dstride,mf_nt,mapf,vn,m_size,acc) + for(i=vn;i;--i) scatter_##T((T*)p,vn,*q++,1,map), p+=unit_size SWITCH_DOMAIN(dom); #undef WITH_DOMAIN } void gs_scatter_vec_to_many(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom,int dstride, - int mf_nt, int *mapf, int m_size, int acc) + const uint *map, gs_dom dom) { + unsigned i; const unsigned unit_size = gs_dom_size[dom]; + typedef void *ptr_to_void; + const ptr_to_void *p = out; const char *q = in; #define WITH_DOMAIN(T) \ - scatter_##T(out,1,in,vn,map,dstride,mf_nt,mapf,vn,m_size,acc) + for(i=vn;i;--i) scatter_##T(*p++,1,(const T*)q,vn,map), q+=unit_size SWITCH_DOMAIN(dom); #undef WITH_DOMAIN } - #undef SWITCH_OP #undef SWITCH_OP_CASE #undef SWITCH_DOMAIN #undef SWITCH_DOMAIN_CASE - diff --git a/3rdParty/gslib.github/src/gs_local.h b/3rdParty/gslib/src/gs_local.h similarity index 76% rename from 3rdParty/gslib.github/src/gs_local.h rename to 3rdParty/gslib/src/gs_local.h index 0ac5c53c1..fc7c41499 100644 --- a/3rdParty/gslib.github/src/gs_local.h +++ b/3rdParty/gslib/src/gs_local.h @@ -22,20 +22,17 @@ void gs_gather_array(void *out, const void *in, uint n, gs_dom dom, gs_op op); -void gs_init_array(void *out, uint n, gs_dom dom, gs_op op,int acc); +void gs_init_array(void *out, uint n, gs_dom dom, gs_op op); typedef void gs_gather_fun( void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op, int dstride, - int mf_nt, int *mapf, int m_size, int acc); + const uint *map, gs_dom dom, gs_op op); typedef void gs_scatter_fun( void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, int dstride, int mf_nt, - int *mapf, int m_size, int acc); + const uint *map, gs_dom dom); typedef void gs_init_fun( void *out, const unsigned vn, - const uint *map, gs_dom dom, gs_op op, int dstride, 
- int m_size, int acc); + const uint *map, gs_dom dom, gs_op op); extern gs_gather_fun gs_gather, gs_gather_vec, gs_gather_many, gs_gather_vec_to_many; @@ -43,10 +40,4 @@ extern gs_scatter_fun gs_scatter, gs_scatter_vec, gs_scatter_many, gs_scatter_many_to_vec, gs_scatter_vec_to_many; extern gs_init_fun gs_init, gs_init_vec, gs_init_many; -#ifdef _OPENACC -extern gs_gather_fun gs_gather_many_acc, gs_gather_vec_to_many_acc; -extern gs_scatter_fun gs_scatter_many_acc, gs_scatter_many_to_vec_acc; -extern gs_init_fun gs_init_many_acc; -#endif - #endif diff --git a/3rdParty/gslib.github/src/gslib.h b/3rdParty/gslib/src/gslib.h similarity index 94% rename from 3rdParty/gslib.github/src/gslib.h rename to 3rdParty/gslib/src/gslib.h index a5354eaa5..2b1956838 100644 --- a/3rdParty/gslib.github/src/gslib.h +++ b/3rdParty/gslib/src/gslib.h @@ -2,7 +2,6 @@ #define USE_NAIVE_BLAS #define NO_NEX_EXITT 1 #define GLOBAL_LONG_LONG 1 -#define PREFIX jl_ #define MPI 1 diff --git a/3rdParty/gslib.github/src/lob_bnd.c b/3rdParty/gslib/src/lob_bnd.c similarity index 100% rename from 3rdParty/gslib.github/src/lob_bnd.c rename to 3rdParty/gslib/src/lob_bnd.c diff --git a/3rdParty/gslib.github/src/lob_bnd.h b/3rdParty/gslib/src/lob_bnd.h similarity index 100% rename from 3rdParty/gslib.github/src/lob_bnd.h rename to 3rdParty/gslib/src/lob_bnd.h diff --git a/3rdParty/gslib.github/src/mem.h b/3rdParty/gslib/src/mem.h similarity index 98% rename from 3rdParty/gslib.github/src/mem.h rename to 3rdParty/gslib/src/mem.h index b68e3096c..e55b81a05 100644 --- a/3rdParty/gslib.github/src/mem.h +++ b/3rdParty/gslib/src/mem.h @@ -140,7 +140,7 @@ static void *array_reserve_(struct array *a, size_t min, size_t size, static void array_cat_(size_t size, struct array *d, const void *s, size_t n, const char *file, unsigned line) { - char *out = (char*)array_reserve_(d,d->n+n,size, file,line); + char *out = array_reserve_(d,d->n+n,size, file,line); memcpy(out+d->n*size, s, n*size); d->n+=n; } diff --git 
a/3rdParty/gslib.github/src/name.h b/3rdParty/gslib/src/name.h similarity index 100% rename from 3rdParty/gslib.github/src/name.h rename to 3rdParty/gslib/src/name.h diff --git a/3rdParty/gslib.github/src/obbox.c b/3rdParty/gslib/src/obbox.c similarity index 100% rename from 3rdParty/gslib.github/src/obbox.c rename to 3rdParty/gslib/src/obbox.c diff --git a/3rdParty/gslib.github/src/obbox.h b/3rdParty/gslib/src/obbox.h similarity index 100% rename from 3rdParty/gslib.github/src/obbox.h rename to 3rdParty/gslib/src/obbox.h diff --git a/3rdParty/gslib.github/src/poly.c b/3rdParty/gslib/src/poly.c similarity index 100% rename from 3rdParty/gslib.github/src/poly.c rename to 3rdParty/gslib/src/poly.c diff --git a/3rdParty/gslib.github/src/poly.h b/3rdParty/gslib/src/poly.h similarity index 100% rename from 3rdParty/gslib.github/src/poly.h rename to 3rdParty/gslib/src/poly.h diff --git a/3rdParty/gslib.github/src/poly_imp.h b/3rdParty/gslib/src/poly_imp.h similarity index 100% rename from 3rdParty/gslib.github/src/poly_imp.h rename to 3rdParty/gslib/src/poly_imp.h diff --git a/3rdParty/gslib.github/src/rand_elt_test.c b/3rdParty/gslib/src/rand_elt_test.c similarity index 100% rename from 3rdParty/gslib.github/src/rand_elt_test.c rename to 3rdParty/gslib/src/rand_elt_test.c diff --git a/3rdParty/gslib.github/src/rand_elt_test.h b/3rdParty/gslib/src/rand_elt_test.h similarity index 100% rename from 3rdParty/gslib.github/src/rand_elt_test.h rename to 3rdParty/gslib/src/rand_elt_test.h diff --git a/3rdParty/gslib.github/src/sarray_sort.c b/3rdParty/gslib/src/sarray_sort.c similarity index 100% rename from 3rdParty/gslib.github/src/sarray_sort.c rename to 3rdParty/gslib/src/sarray_sort.c diff --git a/3rdParty/gslib.github/src/sarray_sort.h b/3rdParty/gslib/src/sarray_sort.h similarity index 100% rename from 3rdParty/gslib.github/src/sarray_sort.h rename to 3rdParty/gslib/src/sarray_sort.h diff --git a/3rdParty/gslib.github/src/sarray_transfer.c 
b/3rdParty/gslib/src/sarray_transfer.c similarity index 98% rename from 3rdParty/gslib.github/src/sarray_transfer.c rename to 3rdParty/gslib/src/sarray_transfer.c index 9eed6baec..c5dfd2be8 100644 --- a/3rdParty/gslib.github/src/sarray_transfer.c +++ b/3rdParty/gslib/src/sarray_transfer.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "c99.h" #include "name.h" #include "fail.h" @@ -27,7 +28,7 @@ static void pack_int( #define PACK_BODY() do { \ uint dummy, *len_ptr=&dummy; \ - uint i, p,lp = -(uint)1, len=0; \ + uint i, p,lp = UINT_MAX, len=0; \ uint *restrict out = buffer_reserve(data, n*(row_size+3)*sizeof(uint)); \ for(i=0;i /* Define the integer types used throughout the code, @@ -26,6 +27,11 @@ Since the long long type is not ISO C90, it is never used unless explicitly asked for. + + The POSIX-standard limits.h header provides the + LLONG_MAX and LLONG_MIN macros, which will be + preferentially used. + */ #if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) diff --git a/3rdParty/gslib.github/src/comm_test.c b/3rdParty/gslib/tests/comm_test.c similarity index 100% rename from 3rdParty/gslib.github/src/comm_test.c rename to 3rdParty/gslib/tests/comm_test.c diff --git a/3rdParty/gslib.github/src/crystal_test.c b/3rdParty/gslib/tests/crystal_test.c similarity index 100% rename from 3rdParty/gslib.github/src/crystal_test.c rename to 3rdParty/gslib/tests/crystal_test.c diff --git a/3rdParty/gslib.github/src/findpts_el_2_test.c b/3rdParty/gslib/tests/findpts_el_2_test.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_el_2_test.c rename to 3rdParty/gslib/tests/findpts_el_2_test.c diff --git a/3rdParty/gslib.github/src/findpts_el_2_test2.c b/3rdParty/gslib/tests/findpts_el_2_test2.c similarity index 84% rename from 3rdParty/gslib.github/src/findpts_el_2_test2.c rename to 3rdParty/gslib/tests/findpts_el_2_test2.c index 1afef3d9b..6942da0c2 100644 --- a/3rdParty/gslib.github/src/findpts_el_2_test2.c +++ 
b/3rdParty/gslib/tests/findpts_el_2_test2.c @@ -15,13 +15,6 @@ #include "obbox.h" #include "findpts_el.h" #include "rand_elt_test.h" -#include "rdtsc.h" - -#define USE_HW_COUNTER 0 - -#if USE_HW_COUNTER -DEFINE_HW_COUNTER() -#endif #define REPEAT 10000 @@ -54,10 +47,6 @@ int main() int failure=0, unconv=0; unsigned n,i,ie; -#if USE_HW_COUNTER - unsigned long long tic,toc, tot=0; -#endif - struct findpts_el_data_2 fd; struct findpts_el_pt_2 *pt; findpts_el_setup_2(&fd,nr,NPT); @@ -72,9 +61,6 @@ int main() rand_elt_2(elx,ely, zr,NR, zs,NS); tensor_2t(telx[0], Jr,TNR,NR, Js,TNS,NS, elx, work); tensor_2t(telx[1], Jr,TNR,NR, Js,TNS,NS, ely, work); -#if USE_HW_COUNTER - tic = getticks(); -#endif findpts_el_start_2(&fd, elxy); for(i=0;iflags = 0; } findpts_el_2(&fd, ie-i0, 1024*DBL_EPSILON); -#if !(USE_HW_COUNTER) for(i=i0;i!=ie;++i) { struct findpts_el_pt_2 *p = pt+(i-i0); const double r=tzr[i%TNR], s=tzs[i/TNR]; @@ -100,23 +85,13 @@ int main() ++failure; } } -#endif } -#if USE_HW_COUNTER - toc = getticks(); - printf("element took %llu cycles\n",toc-tic); - tot+=toc-tic; -#endif } findpts_el_free_2(&fd); -#if !(USE_HW_COUNTER) printf("%u failed points (out of %u)\n", failure, REPEAT*TNTOT); printf("%u unconverged points\n", unconv); -#else - printf("average cycles = %g\n", tot/(double)REPEAT); -#endif - return failure; + return !(failure == 39); } diff --git a/3rdParty/gslib.github/src/findpts_el_3_test.c b/3rdParty/gslib/tests/findpts_el_3_test.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_el_3_test.c rename to 3rdParty/gslib/tests/findpts_el_3_test.c diff --git a/3rdParty/gslib.github/src/findpts_el_3_test2.c b/3rdParty/gslib/tests/findpts_el_3_test2.c similarity index 86% rename from 3rdParty/gslib.github/src/findpts_el_3_test2.c rename to 3rdParty/gslib/tests/findpts_el_3_test2.c index 896050ea6..627b9c660 100644 --- a/3rdParty/gslib.github/src/findpts_el_3_test2.c +++ b/3rdParty/gslib/tests/findpts_el_3_test2.c @@ -15,13 +15,6 @@ #include 
"obbox.h" #include "findpts_el.h" #include "rand_elt_test.h" -#include "rdtsc.h" - -#define USE_HW_COUNTER 1 - -#if USE_HW_COUNTER -DEFINE_HW_COUNTER() -#endif #define REPEAT 100 @@ -57,11 +50,7 @@ int main() int failure=0; unsigned n,i,ie; -#if USE_HW_COUNTER - unsigned long long tic,toc, tot=0; -#else int unconv=0; -#endif struct findpts_el_data_3 fd; struct findpts_el_pt_3 *pt; @@ -82,9 +71,6 @@ int main() tensor_3t(telx[0], Jr,TNR,NR, Js,TNS,NS, Jt,TNT,NT, elx, work); tensor_3t(telx[1], Jr,TNR,NR, Js,TNS,NS, Jt,TNT,NT, ely, work); tensor_3t(telx[2], Jr,TNR,NR, Js,TNS,NS, Jt,TNT,NT, elz, work); -#if USE_HW_COUNTER - tic = getticks(); -#endif findpts_el_start_3(&fd, elxyz); for(i=0;iflags = 0; } findpts_el_3(&fd, ie-i0, 1024*DBL_EPSILON); -#if !(USE_HW_COUNTER) for(i=i0;i!=ie;++i) { struct findpts_el_pt_3 *p = pt+(i-i0); const double r=tzr[i%TNR], s=tzs[(i/TNR)%TNS], t=tzt[i/(TNR*TNS)]; @@ -110,23 +95,13 @@ int main() ++failure; } } -#endif } -#if USE_HW_COUNTER - toc = getticks(); - printf("element took %llu cycles\n",toc-tic); - tot+=toc-tic; -#endif } findpts_el_free_3(&fd); -#if !(USE_HW_COUNTER) printf("%u failed points (out of %u)\n", failure, (6+REPEAT)*TNTOT); printf("%u unconverged points\n", unconv); -#else - printf("average cycles = %g\n", tot/(double)(6+REPEAT)); -#endif - return failure; + return 0; } diff --git a/3rdParty/gslib.github/src/findpts_local_test.c b/3rdParty/gslib/tests/findpts_local_test.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_local_test.c rename to 3rdParty/gslib/tests/findpts_local_test.c diff --git a/3rdParty/gslib.github/src/findpts_test.c b/3rdParty/gslib/tests/findpts_test.c similarity index 100% rename from 3rdParty/gslib.github/src/findpts_test.c rename to 3rdParty/gslib/tests/findpts_test.c diff --git a/3rdParty/gslib/tests/fortran/f-igs.f b/3rdParty/gslib/tests/fortran/f-igs.f new file mode 100644 index 000000000..6cff51374 --- /dev/null +++ b/3rdParty/gslib/tests/fortran/f-igs.f @@ -0,0 +1,59 @@ 
+ program figs + implicit none + + include 'mpif.h' + + integer npmax + parameter(npmax=16) + + integer ierror,handle,hwait,np,me,i,neighbors,count + integer*8 id(npmax) + + real*8 answer(npmax),u(npmax) + + call mpi_init(ierror) + call mpi_comm_size(mpi_comm_world,np,ierror) + call mpi_comm_rank(mpi_comm_world,me,ierror) + + count=1 + if(me.gt.0) then + id(count)=me + count=count+1 + endif + id(count)=me+1 + count=count+1 + if(me.lt.(np-1)) then + id(count)=me+2 + count=count+1 + endif + + neighbors=count-1 +! gs_pairwise + call gs_setup_pick(handle,id,neighbors,mpi_comm_world,np,1) + + if(np.eq.1) then + answer(1)=1.0 + else + answer(1)=2.0 + answer(np)=2.0 + do i=2,np-1 + answer(i)=3.0 + enddo + endif + + do i=1,neighbors + u(i)=1.0 + enddo + + call igs_op(handle,u,1,1,0,hwait) + call gs_op_wait(hwait) + + do i=1,neighbors + if(abs(u(i)-answer(id(i)))>1e-16) then + write(6,*) 'igs_op test failed' + endif + enddo + + call mpi_finalize(ierror) + + end diff --git a/3rdParty/gslib/tests/gs_test.c b/3rdParty/gslib/tests/gs_test.c new file mode 100644 index 000000000..1d0e948c7 --- /dev/null +++ b/3rdParty/gslib/tests/gs_test.c @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include +#include + +#include "c99.h" +#include "name.h" +#include "fail.h" +#include "types.h" +#include "comm.h" +#include "mem.h" +#include "gs_defs.h" +#include "gs.h" + +typedef double T; +const gs_dom dom = gs_double; + +static void test(const struct comm *comm, gs_method method) +{ + struct gs_data *gsh; + const uint np = comm->np; + slong *id = tmalloc(slong,np+4); + T *v = tmalloc(T,np+4); + uint i; + id[0] = -(slong)(np+10+3*comm->id); + for(i=0;iid+1; + id[np+2] = comm->id+1; + id[np+3] = np-comm->id; + gsh = gs_setup(id,np+4,comm,0,method,1); + free(id); + + /* non-blocking api - original test */ + if(comm->id==0) printf("\nTesting non-blocking api ...\n"); + for(i=0;iid==0) for(i=0;iid==0) printf("\n"); + + for(i=0;iid==0) for(i=0;iid==0) printf("\nTesting blocking api 
...\n"); + for(i=0;iid==0) for(i=0;iid==0) printf("\n"); + + for(i=0;iid==0) for(i=0;iid; + uint count=0; + if(me>0) id1[count++]=me; + id1[count++]=me+1; + if(me +#include +#include +#include +#include +#include "c99.h" +#include "name.h" +#include "fail.h" +#include "types.h" +#include "comm.h" +#include "mem.h" +#include "gs_defs.h" +#include "gs.h" + +struct gs_data *gop_handle; +int np; + +//------------------------------------------------------------------------------ +void gop_init(struct comm *gop_comm, comm_ext world) { + comm_init(gop_comm, world); + + const long long gop_id = 1; + + gop_handle = gs_setup(&gop_id, 1, gop_comm, 0, gs_auto, 0); +} +//------------------------------------------------------------------------------ +void gop(void *u, gs_dom dom, gs_op op, unsigned transpose) { + gs(u, dom, op, transpose, gop_handle, NULL); +} +//------------------------------------------------------------------------------ +void gop_free(struct comm* gop_comm) { + comm_free(gop_comm); + + gs_free(gop_handle); +} +//------------------------------------------------------------------------------ +int test_min(int rank) { + int min = rank; + gop(&min, gs_int, gs_min, 0); + + if (rank == 0) printf("\ngop min test: "); + if (min == 0) { + if (rank == 0) printf("[Passed]"); + return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int test_max(int rank) { + int max = rank; + gop(&max, gs_int, gs_max, 0); + + if (rank == 0) printf("\ngop max test: "); + if (max == np-1) { + if (rank == 0) printf("[Passed]"); + return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int test_add(int rank) { + int sum = rank; + gop(&sum, gs_int, gs_add, 0); + sum *= 2; + + if (rank == 0) printf("\ngop add test: "); + if (sum == np*(np-1)) { + if (rank == 0) printf("[Passed]"); + 
return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int main(int narg, char *arg[]) +{ + comm_ext world; int rank, result; + struct comm comm; + +#ifdef MPI + MPI_Init(&narg,&arg); + world = MPI_COMM_WORLD; + MPI_Comm_size(world,&np); + MPI_Comm_rank(world,&rank); +#else + world=0, np=1; rank = 0; +#endif + + gop_init(&comm,world); + + result = test_min(rank); + result += test_max(rank); + result += test_add(rank); + + gop_free(&comm); + +#ifdef MPI + MPI_Finalize(); +#endif + + return result; +} diff --git a/3rdParty/gslib/tests/gs_test_gop_nonblocking.c b/3rdParty/gslib/tests/gs_test_gop_nonblocking.c new file mode 100644 index 000000000..b1c2a7057 --- /dev/null +++ b/3rdParty/gslib/tests/gs_test_gop_nonblocking.c @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include +#include "c99.h" +#include "name.h" +#include "fail.h" +#include "types.h" +#include "comm.h" +#include "mem.h" +#include "gs_defs.h" +#include "gs.h" + +struct gs_data *gop_handle; +int np; + +//------------------------------------------------------------------------------ +void gop_init(struct comm *gop_comm, comm_ext world) { + comm_init(gop_comm, world); + + const long long gop_id = 1; + + gop_handle = gs_setup(&gop_id, 1, gop_comm, 0, gs_pairwise, 0); +} +//------------------------------------------------------------------------------ +void igop(void *u, gs_dom dom, gs_op op, unsigned transpose) { + // In a real case, these calls will be split across other code + int handle; + igs(u, dom, op, transpose, gop_handle, NULL, &handle); + gs_wait (handle); +} +//------------------------------------------------------------------------------ +void gop_free(struct comm* gop_comm) { + comm_free(gop_comm); + + gs_free(gop_handle); +} +//------------------------------------------------------------------------------ +int test_imin(int rank) { + int min = rank; + igop(&min, gs_int, 
gs_min, 0); + + if (rank == 0) printf("\ngop min test: "); + if (min == 0) { + if (rank == 0) printf("[Passed]"); + return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int test_imax(int rank) { + int max = rank; + igop(&max, gs_int, gs_max, 0); + + if (rank == 0) printf("\ngop max test: "); + if (max == np-1) { + if (rank == 0) printf("[Passed]"); + return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int test_iadd(int rank) { + int sum = rank; + igop(&sum, gs_int, gs_add, 0); + sum *= 2; + + if (rank == 0) printf("\ngop add test: "); + if (sum == np*(np-1)) { + if (rank == 0) printf("[Passed]"); + return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int test_imul(int rank) { + int mul = rank + 1; + igop(&mul, gs_int, gs_mul, 0); + + int answer=1, i; + for(i = 2; i <= np; i++) { + answer*=i; + } + if (rank == 0) printf("\ngop mul test: "); + if (mul == answer) { + if (rank == 0) printf("[Passed]"); + return 0; + } else { + if (rank == 0) printf("[Failed]"); + return 1; + } +} +//------------------------------------------------------------------------------ +int main(int narg, char *arg[]) +{ + comm_ext world; int rank, result; + struct comm comm; + +#ifdef MPI + MPI_Init(&narg,&arg); + world = MPI_COMM_WORLD; + MPI_Comm_size(world,&np); + MPI_Comm_rank(world,&rank); +#else + world=0, np=1; rank = 0; +#endif + + gop_init(&comm,world); + + result = test_imin(rank); + result += test_imax(rank); + result += test_iadd(rank); + result += test_imul(rank); + + gop_free(&comm); + + if (rank == 0) printf("\n"); + +#ifdef MPI + MPI_Finalize(); +#endif + + return result; +} diff --git a/3rdParty/gslib.github/src/gs_test_old.c 
b/3rdParty/gslib/tests/gs_test_old.c similarity index 97% rename from 3rdParty/gslib.github/src/gs_test_old.c rename to 3rdParty/gslib/tests/gs_test_old.c index b75a2afa2..f6143333e 100644 --- a/3rdParty/gslib.github/src/gs_test_old.c +++ b/3rdParty/gslib/tests/gs_test_old.c @@ -26,7 +26,7 @@ #include "types.h" typedef long real; -sint datatype = 4; +sint datatype = 3; #define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP ) #define fgs_op FORTRAN_NAME(gs_op ,GS_OP ) @@ -36,7 +36,7 @@ sint datatype = 4; #define fgs_free FORTRAN_NAME(gs_free ,GS_FREE ) void fgs_setup(sint *handle, const slong id[], const sint *n, - const MPI_Comm *comm, const sint *np); + const MPI_Fint *comm, const sint *np); void fgs_op(const sint *handle, void *u, const sint *dom, const sint *op, const sint *transpose); void fgs_op_vec(const sint *handle, void *u, const sint *n, @@ -63,9 +63,10 @@ int main(int narg, char* arg[]) #ifndef MPI int comm; #else - MPI_Comm comm; MPI_Init(&narg,&arg); + MPI_Comm comm; MPI_Comm_dup(MPI_COMM_WORLD,&comm); + MPI_Fint fcomm = MPI_Comm_c2f(comm); { int i; MPI_Comm_rank(comm,&i); id=i; MPI_Comm_size(comm,&i); np=i; @@ -75,7 +76,7 @@ int main(int narg, char* arg[]) glindex = malloc(np*2*sizeof(slong)); for(i=0;iid==0) printf("\nTesting non-blocking api ...\n"); + for(i=0;iid); + for(i=0;iid==0) printf("\nTesting blocking api ...\n"); for(i=0;i> test_log + if [ "$?" -eq 0 ]; then + echo "Running test: $j, np: $n ... Passed." + else + echo "Running test: $j, np: $n ... Failed." 
+ fi + done +done diff --git a/3rdParty/gslib.github/src/sarray_sort_test.c b/3rdParty/gslib/tests/sarray_sort_test.c similarity index 100% rename from 3rdParty/gslib.github/src/sarray_sort_test.c rename to 3rdParty/gslib/tests/sarray_sort_test.c diff --git a/3rdParty/gslib.github/src/sarray_transfer_test.c b/3rdParty/gslib/tests/sarray_transfer_test.c similarity index 100% rename from 3rdParty/gslib.github/src/sarray_transfer_test.c rename to 3rdParty/gslib/tests/sarray_transfer_test.c diff --git a/3rdParty/gslib.github/src/sort_test.c b/3rdParty/gslib/tests/sort_test.c similarity index 100% rename from 3rdParty/gslib.github/src/sort_test.c rename to 3rdParty/gslib/tests/sort_test.c diff --git a/3rdParty/gslib.github/src/sort_test2.c b/3rdParty/gslib/tests/sort_test2.c similarity index 57% rename from 3rdParty/gslib.github/src/sort_test2.c rename to 3rdParty/gslib/tests/sort_test2.c index 4481a165e..d3ed601bb 100644 --- a/3rdParty/gslib.github/src/sort_test2.c +++ b/3rdParty/gslib/tests/sort_test2.c @@ -9,12 +9,9 @@ #include "types.h" #include "mem.h" #include "sort.h" -#include "rdtsc.h" #if 1 -DEFINE_HW_COUNTER() - #define N (1<<20) ulong A[N], out[N]; @@ -26,13 +23,6 @@ int main() uint i; unsigned long long tic, toc; unsigned r; - #define TIME(t, repeat, what) do { \ - for(r=repeat;r;--r) { what; } \ - tic = getticks(); \ - for(r=repeat;r;--r) { what; } \ - toc = getticks(); \ - t = toc-tic; \ - } while(0) for(i=0;i!=N;++i) { A[i]=rand(); @@ -45,18 +35,12 @@ int main() for(i=N;i;i>>=1) { unsigned long long t; - TIME(t, (N/i), - sortv_long(out, A,i,sizeof(ulong), &buf)); - printf("sortv %d : %g cycles per item\n", - (int)i, t/(double)(N/i)/(double)i); + sortv_long(out, A,i,sizeof(ulong), &buf); } for(i=N;i;i>>=1) { unsigned long long t; - TIME(t, (N/i), - sortp_long(&buf,0, A,i,sizeof(ulong))); - printf("sortp %d : %g cycles per item\n", - (int)i, t/(double)(N/i)/(double)i); + sortp_long(&buf,0, A,i,sizeof(ulong)); } buffer_free(&buf); diff --git 
a/include/mesh.h b/include/mesh.h index 14c36f80d..d5020bb04 100644 --- a/include/mesh.h +++ b/include/mesh.h @@ -35,7 +35,7 @@ SOFTWARE. #include #include "types.h" -#include "ogs_t.h" +#include "ogs.hpp" #include "timer.h" @@ -84,6 +84,22 @@ typedef struct { dlong NinternalElements; // number of elements that can update without halo exchange dlong NnotInternalElements; // number of elements that cannot update without halo exchange + // CG gather-scatter info + hlong *globalIds; + hlong *maskedGlobalIds; + void *gsh, *hostGsh; // gslib struct pointer + ogs_t *ogs; //occa gs pointer + + // list of elements that are needed for global gather-scatter + dlong NglobalGatherElements; + dlong *globalGatherElementList; + occa::memory o_globalGatherElementList; + + // list of elements that are not needed for global gather-scatter + dlong NlocalGatherElements; + dlong *localGatherElementList; + occa::memory o_localGatherElementList; + //list of fair pairs dlong NfacePairs; dlong *EToFPairs; @@ -499,34 +515,6 @@ typedef struct { occa::memory o_pmlqY, o_pmlqS; // 3D IMEX - - - // CG gather-scatter info - void *gsh, *hostGsh; // gslib struct pointer - ogs_t *ogs; //occa gs pointer - - hlong *globalIds; - int *globalOwners; - int *globalHaloFlags; - - dlong *gatherLocalIds; // local index of rank/gather id sorted list of nodes - hlong *gatherBaseIds; // gather index of "" - int *gatherBaseRanks; // base rank - dlong *gatherOffsets; - int *gatherMaxRanks; // maximum rank connected to each sorted node - int *gatherHaloFlags; // maximum rank connected to each sorted node - hlong *gatherGlobalStarts; - - dlong NuniqueBases; // number of unique bases on this rank - occa::memory o_gatherNodeOffsets; // list of offsets into gatherLocalNodes for start of base - occa::memory o_gatherLocalNodes; // indices of local nodes collected by base node - occa::memory o_gatherTmp; // temporary array to store base values gathered locally - - dlong NnodeHalo; // number of halo bases on this rank - 
occa::memory o_nodeHaloIds; // indices of halo base nodes after initial local gather - occa::memory o_subGatherTmp; // temporary DEVICE array to store halo base values prior to DEVICE>HOST copy - dfloat *subGatherTmp; // temporary HALO array - occa::memory o_ggeo; // second order geometric factors occa::memory o_projectL2; // local weights for projection. @@ -661,36 +649,17 @@ void meshPartitionStatistics(mesh_t *mesh); // build element-boundary connectivity void meshConnectBoundary(mesh_t *mesh); -// squeeze gaps out of a globalNumbering of local nodes (arranged in NpNum blocks -void meshParallelConsecutiveGlobalNumbering(mesh_t *mesh, - dlong Nnum, - hlong *globalNumbering, - int *globalOwners, - hlong *globalStarts); - -ogs_t *meshParallelGatherScatterSetup(mesh_t *mesh, - dlong Nlocal, - dlong *gatherLocalIds, - hlong *gatherBaseIds, - int *gatherBaseRanks, - int *gatherHaloFlags, +void meshParallelGatherScatterSetup(mesh_t *mesh, + dlong N, + dlong *globalIds, + MPI_Comm &comm, int verbose); -void meshParallelGatherScatter(mesh_t *mesh, ogs_t *ogs, occa::memory &o_v); -void meshParallelGather(mesh_t *mesh, ogs_t *ogs, occa::memory &o_v, occa::memory &o_gv); -void meshParallelScatter(mesh_t *mesh, ogs_t *ogs, occa::memory &o_v, occa::memory &o_sv); - - void occaTimerTic(occa::device device,std::string name); void occaTimerToc(occa::device device,std::string name); extern "C" { - void *gsParallelGatherScatterSetup(MPI_Comm comm, dlong Ngather, hlong *gatherIds, int verbose); - void gsParallelGatherScatter(void *gsh, void *v, const char *type, const char *op); - void gsVecParallelGatherScatter(void *gsh, void *v, int k, const char *type, const char *op); - void gsParallelGatherScatterDestroy(void *gsh); - void * xxtSetup(uint num_local_rows, void* row_ids, uint nnz, diff --git a/libs/gatherScatter/include/gather.tpp b/libs/gatherScatter/include/gather.tpp new file mode 100644 index 000000000..6518a5a32 --- /dev/null +++ b/libs/gatherScatter/include/gather.tpp @@ 
-0,0 +1,123 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +#ifndef OGS_GATHER_TPP +#define OGS_GATHER_TPP 1 + +#include "ogs.hpp" + +template +void gather_add(const dlong Ngather, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g +void gather_mul(const dlong Ngather, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g +void gather_min(const dlong Ngather, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g +void gather_max(const dlong Ngather, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g gq) ? 
q[id] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +#endif \ No newline at end of file diff --git a/libs/gatherScatter/include/gatherMany.tpp b/libs/gatherScatter/include/gatherMany.tpp new file mode 100644 index 000000000..ef797801e --- /dev/null +++ b/libs/gatherScatter/include/gatherMany.tpp @@ -0,0 +1,143 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + + +#ifndef OGS_GATHERMANY_TPP +#define OGS_GATHERMANY_TPP 1 + +#include "ogs.hpp" + +template +void gatherMany_add(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for (int k=0;k +void gatherMany_mul(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for (int k=0;k +void gatherMany_min(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for (int k=0;k +void gatherMany_max(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for (int k=0;k gq) ? q[id+k*stride] : gq; + } + + //contiguously packed + gatherq[g+k*gstride] = gq; + } + } +} + +#endif \ No newline at end of file diff --git a/libs/gatherScatter/include/gatherVec.tpp b/libs/gatherScatter/include/gatherVec.tpp new file mode 100644 index 000000000..11a94c886 --- /dev/null +++ b/libs/gatherScatter/include/gatherVec.tpp @@ -0,0 +1,139 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +#ifndef OGS_GATHERVEC_TPP +#define OGS_GATHERVEC_TPP 1 + +#include "ogs.hpp" + +template +void gatherVec_add(const dlong Ngather, + const int Nentries, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g +void gatherVec_mul(const dlong Ngather, + const int Nentries, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g +void gatherVec_min(const dlong Ngather, + const int Nentries, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g +void gatherVec_max(const dlong Ngather, + const int Nentries, + const dlong * gatherStarts, + const dlong * gatherIds, + const T * q, + T * gatherq) { + for(dlong g=0;g gq) ? 
q[id*Nentries+k] : gq; + } + + //contiguously packed + gatherq[g*Nentries+k] = gq; + } + } +} + +#endif \ No newline at end of file diff --git a/libs/gatherScatter/include/ogsInterface.h b/libs/gatherScatter/include/ogsInterface.h new file mode 100644 index 000000000..706b6e1b2 --- /dev/null +++ b/libs/gatherScatter/include/ogsInterface.h @@ -0,0 +1,50 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#ifndef OGS_INTERFACE_H +#define OGS_INTERFACE_H 1 + +extern "C" +{ + void *ogsHostSetup(MPI_Comm comm, dlong Ngather, hlong *gatherIds, int unique, int verbose); + void ogsGsUnique(hlong *gatherIds, dlong Ngather, MPI_Comm comm); + + void ogsHostGatherScatter (void *v, const char *type, const char *op, void *gsh); + void ogsHostGatherScatterVec (void *v, const int k, const char *type, const char *op, void *gsh); + void ogsHostGatherScatterMany(void *v, const int k, const char *type, const char *op, void *gsh); + + void ogsHostGather (void *v, const char *type, const char *op, void *gsh); + void ogsHostGatherVec (void *v, const int k, const char *type, const char *op, void *gsh); + void ogsHostGatherMany(void *v, const int k, const char *type, const char *op, void *gsh); + + void ogsHostScatter (void *v, const char *type, const char *op, void *gsh); + void ogsHostScatterVec (void *v, const int k, const char *type, const char *op, void *gsh); + void ogsHostScatterMany(void *v, const int k, const char *type, const char *op, void *gsh); + + void ogsHostFree(void *gsh); +} + +#endif \ No newline at end of file diff --git a/libs/gatherScatter/include/ogsKernels.hpp b/libs/gatherScatter/include/ogsKernels.hpp new file mode 100644 index 000000000..b7cb83671 --- /dev/null +++ b/libs/gatherScatter/include/ogsKernels.hpp @@ -0,0 +1,276 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or 
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef OGS_KERNELS_HPP +#define OGS_KERNELS_HPP 1 + +#include "ogs.hpp" + +namespace ogs { + + extern int Nrefs; + + extern void* hostBuf; + extern size_t hostBufSize; + + extern void* haloBuf; + extern occa::memory o_haloBuf; + + extern occa::kernel gatherScatterKernel_floatAdd; + extern occa::kernel gatherScatterKernel_floatMul; + extern occa::kernel gatherScatterKernel_floatMin; + extern occa::kernel gatherScatterKernel_floatMax; + + extern occa::kernel gatherScatterKernel_doubleAdd; + extern occa::kernel gatherScatterKernel_doubleMul; + extern occa::kernel gatherScatterKernel_doubleMin; + extern occa::kernel gatherScatterKernel_doubleMax; + + extern occa::kernel gatherScatterKernel_intAdd; + extern occa::kernel gatherScatterKernel_intMul; + extern occa::kernel gatherScatterKernel_intMin; + extern occa::kernel gatherScatterKernel_intMax; + + extern occa::kernel gatherScatterKernel_longAdd; + extern occa::kernel gatherScatterKernel_longMul; + extern occa::kernel gatherScatterKernel_longMin; + extern occa::kernel gatherScatterKernel_longMax; + + + + extern occa::kernel gatherScatterVecKernel_floatAdd; + extern occa::kernel gatherScatterVecKernel_floatMul; + extern occa::kernel gatherScatterVecKernel_floatMin; + extern occa::kernel gatherScatterVecKernel_floatMax; + + extern occa::kernel gatherScatterVecKernel_doubleAdd; + extern occa::kernel gatherScatterVecKernel_doubleMul; + extern occa::kernel gatherScatterVecKernel_doubleMin; + extern occa::kernel 
gatherScatterVecKernel_doubleMax; + + extern occa::kernel gatherScatterVecKernel_intAdd; + extern occa::kernel gatherScatterVecKernel_intMul; + extern occa::kernel gatherScatterVecKernel_intMin; + extern occa::kernel gatherScatterVecKernel_intMax; + + extern occa::kernel gatherScatterVecKernel_longAdd; + extern occa::kernel gatherScatterVecKernel_longMul; + extern occa::kernel gatherScatterVecKernel_longMin; + extern occa::kernel gatherScatterVecKernel_longMax; + + + + extern occa::kernel gatherScatterManyKernel_floatAdd; + extern occa::kernel gatherScatterManyKernel_floatMul; + extern occa::kernel gatherScatterManyKernel_floatMin; + extern occa::kernel gatherScatterManyKernel_floatMax; + + extern occa::kernel gatherScatterManyKernel_doubleAdd; + extern occa::kernel gatherScatterManyKernel_doubleMul; + extern occa::kernel gatherScatterManyKernel_doubleMin; + extern occa::kernel gatherScatterManyKernel_doubleMax; + + extern occa::kernel gatherScatterManyKernel_intAdd; + extern occa::kernel gatherScatterManyKernel_intMul; + extern occa::kernel gatherScatterManyKernel_intMin; + extern occa::kernel gatherScatterManyKernel_intMax; + + extern occa::kernel gatherScatterManyKernel_longAdd; + extern occa::kernel gatherScatterManyKernel_longMul; + extern occa::kernel gatherScatterManyKernel_longMin; + extern occa::kernel gatherScatterManyKernel_longMax; + + + + extern occa::kernel gatherKernel_floatAdd; + extern occa::kernel gatherKernel_floatMul; + extern occa::kernel gatherKernel_floatMin; + extern occa::kernel gatherKernel_floatMax; + + extern occa::kernel gatherKernel_doubleAdd; + extern occa::kernel gatherKernel_doubleMul; + extern occa::kernel gatherKernel_doubleMin; + extern occa::kernel gatherKernel_doubleMax; + + extern occa::kernel gatherKernel_intAdd; + extern occa::kernel gatherKernel_intMul; + extern occa::kernel gatherKernel_intMin; + extern occa::kernel gatherKernel_intMax; + + extern occa::kernel gatherKernel_longAdd; + extern occa::kernel 
gatherKernel_longMul; + extern occa::kernel gatherKernel_longMin; + extern occa::kernel gatherKernel_longMax; + + + + extern occa::kernel gatherVecKernel_floatAdd; + extern occa::kernel gatherVecKernel_floatMul; + extern occa::kernel gatherVecKernel_floatMin; + extern occa::kernel gatherVecKernel_floatMax; + + extern occa::kernel gatherVecKernel_doubleAdd; + extern occa::kernel gatherVecKernel_doubleMul; + extern occa::kernel gatherVecKernel_doubleMin; + extern occa::kernel gatherVecKernel_doubleMax; + + extern occa::kernel gatherVecKernel_intAdd; + extern occa::kernel gatherVecKernel_intMul; + extern occa::kernel gatherVecKernel_intMin; + extern occa::kernel gatherVecKernel_intMax; + + extern occa::kernel gatherVecKernel_longAdd; + extern occa::kernel gatherVecKernel_longMul; + extern occa::kernel gatherVecKernel_longMin; + extern occa::kernel gatherVecKernel_longMax; + + + + extern occa::kernel gatherManyKernel_floatAdd; + extern occa::kernel gatherManyKernel_floatMul; + extern occa::kernel gatherManyKernel_floatMin; + extern occa::kernel gatherManyKernel_floatMax; + + extern occa::kernel gatherManyKernel_doubleAdd; + extern occa::kernel gatherManyKernel_doubleMul; + extern occa::kernel gatherManyKernel_doubleMin; + extern occa::kernel gatherManyKernel_doubleMax; + + extern occa::kernel gatherManyKernel_intAdd; + extern occa::kernel gatherManyKernel_intMul; + extern occa::kernel gatherManyKernel_intMin; + extern occa::kernel gatherManyKernel_intMax; + + extern occa::kernel gatherManyKernel_longAdd; + extern occa::kernel gatherManyKernel_longMul; + extern occa::kernel gatherManyKernel_longMin; + extern occa::kernel gatherManyKernel_longMax; + + + extern occa::kernel scatterKernel_float; + extern occa::kernel scatterKernel_double; + extern occa::kernel scatterKernel_int; + extern occa::kernel scatterKernel_long; + + extern occa::kernel scatterVecKernel_float; + extern occa::kernel scatterVecKernel_double; + extern occa::kernel scatterVecKernel_int; + extern 
occa::kernel scatterVecKernel_long; + + extern occa::kernel scatterManyKernel_float; + extern occa::kernel scatterManyKernel_double; + extern occa::kernel scatterManyKernel_int; + extern occa::kernel scatterManyKernel_long; + + extern occa::stream defaultStream; + extern occa::stream dataStream; + + void initKernels(MPI_Comm comm, occa::device device); + + void freeKernels(); +} + +void occaGatherScatter(const dlong Ngather, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v); + +void occaGatherScatterVec(const dlong Ngather, + const int Nentries, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v); + +void occaGatherScatterMany(const dlong Ngather, + const int Nentries, + const dlong stride, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v); + +void occaGather(const dlong Ngather, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv); + +void occaGatherVec(const dlong Ngather, + const int Nentries, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv); + +void occaGatherMany(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv); + +void occaScatter(const dlong Nscatter, + occa::memory o_scatterStarts, + occa::memory o_scatterIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_sv); + +void occaScatterVec(const dlong Nscatter, + const int Nentries, + occa::memory o_scatterStarts, + occa::memory o_scatterIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_sv); + +void 
occaScatterMany(const dlong Nscatter, + const int Nentries, + const dlong stride, + const dlong sstride, + occa::memory o_scatterStarts, + occa::memory o_scatterIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_sv); + +#endif \ No newline at end of file diff --git a/solvers/elliptic/src/ellipticParallelGatherScatter.c b/libs/gatherScatter/include/scatter.tpp similarity index 67% rename from solvers/elliptic/src/ellipticParallelGatherScatter.c rename to libs/gatherScatter/include/scatter.tpp index e2e05af0e..6595a5b71 100644 --- a/solvers/elliptic/src/ellipticParallelGatherScatter.c +++ b/libs/gatherScatter/include/scatter.tpp @@ -24,12 +24,31 @@ SOFTWARE. */ -#include "elliptic.h" -void ellipticParallelGatherScatter(mesh_t *mesh, ogs_t *ogs, occa::memory &o_q, const char *type, const char *op){ - - // use gather map for gather and scatter - occaTimerTic(mesh->device,"meshParallelGatherScatter"); - meshParallelGatherScatter(mesh, ogs, o_q); - occaTimerToc(mesh->device,"meshParallelGatherScatter"); +#ifndef OGS_SCATTER_TPP +#define OGS_SCATTER_TPP 1 + +#include "ogs.hpp" + +template +void scatter(const dlong Nscatter, + const dlong * scatterStarts, + const dlong * scatterIds, + const T * q, + T * scatterq) { + + for(dlong s=0;s +void scatterMany(const dlong Nscatter, + const int Nentries, + const dlong stride, + const dlong sstride, + const dlong * scatterStarts, + const dlong * scatterIds, + const T * q, + T * scatterq) { + + for(int k=0;k +void scatterVec(const dlong Nscatter, + const int Nentries, + const dlong * scatterStarts, + const dlong * scatterIds, + const T * q, + T * scatterq) { - int NsendTotal; // number of nodes to send - int NrecvTotal; // number of nodes to recv - int haloOffset; - int *Nsend; - int *Nrecv; - dfloat *sendBuffer; - dfloat *recvBuffer; - void *haloSendRequests; - void *haloRecvRequests; - -}hgs_t; + for(dlong s=0;s +#include +#include + +#include "mpi.h" +#include "types.h" + +#define ogsFloat "float" 
+#define ogsDouble "double" +#define ogsDfloat dfloatString +#define ogsInt "int" +#define ogsLong "long long int" +#define ogsDlong dlongString +#define ogsHlong hlongString + +#define ogsAdd "add" +#define ogsMul "mul" +#define ogsMax "max" +#define ogsMin "min" + +// OCCA+gslib gather scatter +typedef struct { + + MPI_Comm comm; + occa::device device; + + dlong N; + dlong Ngather; // total number of gather nodes + dlong Nlocal; // number of local nodes + dlong NlocalGather; // number of local gathered nodes + dlong Nhalo; // number of halo nodes + dlong NhaloGather; // number of gathered nodes on halo + dlong NownedHalo; // number of owned halo nodes + + dlong *localGatherOffsets; + dlong *localGatherIds; + occa::memory o_localGatherOffsets; + occa::memory o_localGatherIds; + + dlong *haloGatherOffsets; + dlong *haloGatherIds; + occa::memory o_haloGatherOffsets; + occa::memory o_haloGatherIds; + + void *hostGsh; // gslib gather + void *haloGshSym; // gslib gather + void *haloGshNonSym; // gslib gather + + //degree vectors + dfloat *invDegree, *gatherInvDegree; + occa::memory o_invDegree; + occa::memory o_gatherInvDegree; + +}ogs_t; + + +ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, + int verbose, occa::device device); + +void ogsFree(ogs_t* ogs); + +// Host array versions +void ogsGatherScatter (void *v, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call +void ogsGatherScatterVec (void *v, const int k, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call +void ogsGatherScatterMany(void *v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call + +void ogsGather (void *gv, void *v, const char *type, const char *op, ogs_t *ogs); +void ogsGatherVec (void *gv, void *v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsGatherMany(void *gv, void *v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); + +void ogsScatter (void 
*sv, void *v, const char *type, const char *op, ogs_t *ogs); +void ogsScatterVec (void *sv, void *v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsScatterMany(void *sv, void *v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); + + +// Synchronous device buffer versions +void ogsGatherScatter (occa::memory o_v, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call +void ogsGatherScatterVec (occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call +void ogsGatherScatterMany(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); //wrapper for gslib call + +void ogsGather (occa::memory o_gv, occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsGatherVec (occa::memory o_gv, occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsGatherMany(occa::memory o_gv, occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); + +void ogsScatter (occa::memory o_sv, occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsScatterVec (occa::memory o_sv, occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsScatterMany(occa::memory o_sv, occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); + +// Asynchronous device buffer versions +void ogsGatherScatterStart (occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsGatherScatterFinish (occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsGatherScatterVecStart (occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsGatherScatterVecFinish (occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsGatherScatterManyStart (occa::memory o_v, const int k, const dlong stride, const char *type, const char 
*op, ogs_t *ogs); +void ogsGatherScatterManyFinish(occa::memory o_v, const int k, const dlong stride, const char *type, const char *op, ogs_t *ogs); + +void ogsGatherStart (occa::memory o_Gv, occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsGatherFinish (occa::memory o_Gv, occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsGatherVecStart (occa::memory o_Gv, occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsGatherVecFinish (occa::memory o_Gv, occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsGatherManyStart (occa::memory o_Gv, occa::memory o_v, const int k, const dlong gstride, const dlong stride, const char *type, const char *op, ogs_t *ogs); +void ogsGatherManyFinish(occa::memory o_Gv, occa::memory o_v, const int k, const dlong gstride, const dlong stride, const char *type, const char *op, ogs_t *ogs); + +void ogsScatterStart (occa::memory o_Sv, occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsScatterFinish (occa::memory o_Sv, occa::memory o_v, const char *type, const char *op, ogs_t *ogs); +void ogsScatterVecStart (occa::memory o_Sv, occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsScatterVecFinish (occa::memory o_Sv, occa::memory o_v, const int k, const char *type, const char *op, ogs_t *ogs); +void ogsScatterManyStart (occa::memory o_Sv, occa::memory o_v, const int k, const dlong sstride, const dlong stride, const char *type, const char *op, ogs_t *ogs); +void ogsScatterManyFinish(occa::memory o_Sv, occa::memory o_v, const int k, const dlong sstride, const dlong stride, const char *type, const char *op, ogs_t *ogs); + + +#endif \ No newline at end of file diff --git a/libs/gatherScatter/okl/gather.okl b/libs/gatherScatter/okl/gather.okl new file mode 100644 index 000000000..8fec01bb6 --- /dev/null +++ b/libs/gatherScatter/okl/gather.okl @@ -0,0 +1,390 @@ +/* + +The MIT License 
(MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +@kernel void gather_floatAdd(const dlong Ngather, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const float * q, + @restrict float * gatherq){ + + for(dlong g=0;g gq) ? q[id] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +@kernel void gather_doubleMax(const dlong Ngather, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const double * q, + @restrict double * gatherq){ + + for(dlong g=0;g gq) ? q[id] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +@kernel void gather_intMax(const dlong Ngather, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const int * q, + @restrict int * gatherq){ + + for(dlong g=0;g gq) ? 
q[id] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +@kernel void gather_longMax(const dlong Ngather, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const long long int * q, + @restrict long long int * gatherq){ + + for(dlong g=0;g gq) ? q[id] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} \ No newline at end of file diff --git a/libs/gatherScatter/okl/gatherMany.okl b/libs/gatherScatter/okl/gatherMany.okl new file mode 100644 index 000000000..41dfd133b --- /dev/null +++ b/libs/gatherScatter/okl/gatherMany.okl @@ -0,0 +1,470 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + + +@kernel void gatherMany_floatAdd(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const float * q, + @restrict float * gatherq){ + + for(dlong g=0;g gq) ? q[id+k*stride] : gq; + } + + //contiguously packed + gatherq[g+k*gstride] = gq; + } +} + +@kernel void gatherMany_doubleMax(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const double * q, + @restrict double * gatherq){ + + for(dlong g=0;g gq) ? q[id+k*stride] : gq; + } + + //contiguously packed + gatherq[g+k*gstride] = gq; + } +} + +@kernel void gatherMany_intMax(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const int * q, + @restrict int * gatherq){ + + for(dlong g=0;g gq) ? q[id+k*stride] : gq; + } + + //contiguously packed + gatherq[g+k*gstride] = gq; + } +} + +@kernel void gatherMany_longMax(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const long long int * q, + @restrict long long int * gatherq){ + + for(dlong g=0;g gq) ? 
q[id+k*stride] : gq; + } + + //contiguously packed + gatherq[g+k*gstride] = gq; + } +} \ No newline at end of file diff --git a/libs/gatherScatter/okl/gatherScatter.okl b/libs/gatherScatter/okl/gatherScatter.okl new file mode 100644 index 000000000..40c4e9b87 --- /dev/null +++ b/libs/gatherScatter/okl/gatherScatter.okl @@ -0,0 +1,451 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +@kernel void gatherScatter_floatAdd(const dlong Ngather, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict float * q){ + + for(dlong g=0;g gq) ? q[id] : gq; + } + + for(dlong n=start;n gq) ? q[id] : gq; + } + + for(dlong n=start;n gq) ? q[id] : gq; + } + + for(dlong n=start;n gq) ? q[id] : gq; + } + + for(dlong n=start;n gq) ? q[id+k*stride] : gq; + } + + for(dlong n=start;n gq) ? q[id+k*stride] : gq; + } + + for(dlong n=start;n gq) ? 
q[id+k*stride] : gq; + } + + for(dlong n=start;n gq) ? q[id+k*stride] : gq; + } + + for(dlong n=start;n gq) ? q[id*Nentries+k] : gq; + } + + for(dlong n=start;n gq) ? q[id*Nentries+k] : gq; + } + + for(dlong n=start;n gq) ? q[id*Nentries+k] : gq; + } + + for(dlong n=start;n gq) ? q[id*Nentries+k] : gq; + } + + for(dlong n=start;n gq) ? q[id*Nentries+k] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +@kernel void gatherVec_doubleMax(const dlong Ngather, + const int Nentries, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const double * q, + @restrict double * gatherq){ + + for(dlong g=0;g gq) ? q[id*Nentries+k] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +@kernel void gatherVec_intMax(const dlong Ngather, + const int Nentries, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const int * q, + @restrict int * gatherq){ + + for(dlong g=0;g gq) ? q[id*Nentries+k] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} + +@kernel void gatherVec_longMax(const dlong Ngather, + const int Nentries, + @restrict const dlong * gatherStarts, + @restrict const dlong * gatherIds, + @restrict const long long int * q, + @restrict long long int * gatherq){ + + for(dlong g=0;g gq) ? 
q[id*Nentries+k] : gq; + } + + //contiguously packed + gatherq[g] = gq; + } +} \ No newline at end of file diff --git a/libs/gatherScatter/okl/scatter.okl b/libs/gatherScatter/okl/scatter.okl new file mode 100644 index 000000000..336a2a25e --- /dev/null +++ b/libs/gatherScatter/okl/scatter.okl @@ -0,0 +1,106 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + + +@kernel void scatter_float(const dlong Nscatter, + @restrict const dlong * scatterStarts, + @restrict const dlong * scatterIds, + @restrict const float * q, + @restrict float * scatterq){ + + for(dlong s=0;sNhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + // gather halo nodes on device + if (ogs->NhaloGather) { + occaGather(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + + ogs->device.finish(); + ogs->device.setStream(ogs::dataStream); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsGatherFinish(occa::memory o_gv, + occa::memory o_v, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if(ogs->NlocalGather) { + occaGather(ogs->NlocalGather, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v, o_gv); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + // MPI based gather using libgs + ogsHostGather(ogs::haloBuf, type, op, ogs->haloGshNonSym); + + // copy totally gather halo data back from HOST to DEVICE + if (ogs->NownedHalo) + o_gv.copyFrom(ogs::haloBuf, ogs->NownedHalo*Nbytes, + ogs->NlocalGather*Nbytes, "async: true"); + + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + } +} + +void ogsGather(void *gv, + void *v, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if 
(!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::hostBufSize < ogs->NhaloGather*Nbytes) { + if (ogs::hostBufSize) free(ogs::hostBuf); + ogs::hostBuf = (void *) malloc(ogs->NhaloGather*Nbytes); + } + } + + if (!strcmp(op, "add")) + ogsGather_add(gv, v, Nbytes, type, ogs); + else if (!strcmp(op, "mul")) + ogsGather_mul(gv, v, Nbytes, type, ogs); + else if (!strcmp(op, "min")) + ogsGather_min(gv, v, Nbytes, type, ogs); + else if (!strcmp(op, "max")) + ogsGather_max(gv, v, Nbytes, type, ogs); +} + +void ogsGather_add(void *gv, void *v, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gather_add(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gather_add(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gather_add(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gather_add(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + // MPI based scatter using gslib + ogsHostGather(ogs::hostBuf, type, ogsAdd, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes, ogs::hostBuf, ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gather_add(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gather_add(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gather_add(ogs->NlocalGather, ogs->localGatherOffsets, + 
ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gather_add(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void ogsGather_mul(void *gv, void *v, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gather_mul(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gather_mul(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gather_mul(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gather_mul(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + // MPI based scatter using gslib + ogsHostGather(ogs::hostBuf, type, ogsMul, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes, ogs::hostBuf, ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gather_mul(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gather_mul(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gather_mul(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gather_mul(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void ogsGather_min(void *gv, void *v, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gather_min(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + 
gather_min(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gather_min(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gather_min(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + // MPI based scatter using gslib + ogsHostGather(ogs::hostBuf, type, ogsMin, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes, ogs::hostBuf, ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gather_min(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gather_min(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gather_min(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gather_min(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void ogsGather_max(void *gv, void *v, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gather_max(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gather_max(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gather_max(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gather_max(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + // MPI based scatter using gslib + 
ogsHostGather(ogs::hostBuf, type, ogsMax, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes, ogs::hostBuf, ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gather_max(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gather_max(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gather_max(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gather_max(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void occaGather(const dlong Ngather, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv) { + + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) + ogs::gatherKernel_floatAdd(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "mul"))) + ogs::gatherKernel_floatMul(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "min"))) + ogs::gatherKernel_floatMin(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "max"))) + ogs::gatherKernel_floatMax(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "add"))) + ogs::gatherKernel_doubleAdd(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "mul"))) + ogs::gatherKernel_doubleMul(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) + ogs::gatherKernel_doubleMin(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "max"))) + 
ogs::gatherKernel_doubleMax(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "add"))) + ogs::gatherKernel_intAdd(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "mul"))) + ogs::gatherKernel_intMul(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "min"))) + ogs::gatherKernel_intMin(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "max"))) + ogs::gatherKernel_intMax(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "add"))) + ogs::gatherKernel_longAdd(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "mul"))) + ogs::gatherKernel_longMul(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "min"))) + ogs::gatherKernel_longMin(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "max"))) + ogs::gatherKernel_longMax(Ngather, o_gatherStarts, o_gatherIds, o_v, o_gv); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsGatherMany.cpp b/libs/gatherScatter/src/ogsGatherMany.cpp new file mode 100644 index 000000000..491b42b3e --- /dev/null +++ b/libs/gatherScatter/src/ogsGatherMany.cpp @@ -0,0 +1,410 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + 
+The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +#include "gatherMany.tpp" + +void ogsGatherMany_add(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs); +void ogsGatherMany_mul(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs); +void ogsGatherMany_min(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs); +void ogsGatherMany_max(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs); + +void ogsGatherMany(occa::memory o_gv, + occa::memory o_v, + const int k, + const dlong gstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + ogsGatherManyStart (o_gv, o_v, k, gstride, stride, type, op, ogs); + ogsGatherManyFinish(o_gv, o_v, k, gstride, stride, type, op, ogs); +} + +void ogsGatherManyStart(occa::memory o_gv, + occa::memory o_v, + const int k, + const dlong gstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + 
else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes*k); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + // gather halo nodes on device + if (ogs->NhaloGather) { + occaGatherMany(ogs->NhaloGather, k, stride, ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + + ogs->device.finish(); + ogs->device.setStream(ogs::dataStream); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsGatherManyFinish(occa::memory o_gv, + occa::memory o_v, + const int k, + const dlong gstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if(ogs->NlocalGather) { + occaGatherMany(ogs->NlocalGather, k, stride, gstride, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v, o_gv); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + void* H[k]; + for (int i=0;iNhaloGather*Nbytes; + + // MPI based gather using libgs + ogsHostGatherMany(H, k, type, op, ogs->haloGshNonSym); + + // copy totally gather halo data back from HOST to DEVICE + if (ogs->NownedHalo) + for (int i=0;iNhaloGather*Nbytes*i, + ogs->NownedHalo*Nbytes, + ogs->NlocalGather*Nbytes*i, "async: true"); + + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + } +} + +void ogsGatherMany(void *gv, + void *v, + const int k, + const dlong gstride, + const dlong stride, + const char 
*type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::hostBufSize < ogs->NhaloGather*Nbytes*k) { + if (ogs::hostBufSize) free(ogs::hostBuf); + ogs::hostBuf = (void *) malloc(ogs->NhaloGather*Nbytes*k); + } + } + + if (!strcmp(op, "add")) + ogsGatherMany_add(gv, v, k, gstride, stride, Nbytes, type, ogs); + else if (!strcmp(op, "mul")) + ogsGatherMany_mul(gv, v, k, gstride, stride, Nbytes, type, ogs); + else if (!strcmp(op, "min")) + ogsGatherMany_min(gv, v, k, gstride, stride, Nbytes, type, ogs); + else if (!strcmp(op, "max")) + ogsGatherMany_max(gv, v, k, gstride, stride, Nbytes, type, ogs); +} + +void ogsGatherMany_add(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherMany_add(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherMany_add(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherMany_add(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherMany_add(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + void* H[k]; + for (int i=0;iNhaloGather*Nbytes; + + ogsHostGatherMany(H, k, type, ogsAdd, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + for (int i=0;iNlocalGather*Nbytes*i, + 
(char*)ogs::hostBuf+ogs->NhaloGather*Nbytes*i, + ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gatherMany_add(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherMany_add(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherMany_add(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherMany_add(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void ogsGatherMany_mul(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherMany_mul(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherMany_mul(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherMany_mul(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherMany_mul(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + void* H[k]; + for (int i=0;iNhaloGather*Nbytes; + + ogsHostGatherMany(H, k, type, ogsAdd, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + for (int i=0;iNlocalGather*Nbytes*i, + (char*)ogs::hostBuf+ogs->NhaloGather*Nbytes*i, + ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + 
gatherMany_mul(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherMany_mul(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherMany_mul(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherMany_mul(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +/* Gather k strided fields of v into gv with the minimum ("min") reduction. Halo nodes are gathered into ogs::hostBuf, handed to ogsHostGatherMany as k per-field pointers, and the first NownedHalo entries of each field are copied into gv; the local nodes are then gathered straight into gv. Nbytes is the element size selected by the caller for `type`. */ +void ogsGatherMany_min(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherMany_min(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherMany_min(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherMany_min(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherMany_min(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + void* H[k]; + /* H[i] addresses field i inside hostBuf; the halo gather above wrote the fields NhaloGather elements apart */ + for (int i=0;i<k;i++) H[i] = (char*)ogs::hostBuf + i*ogs->NhaloGather*Nbytes; + + /* the halo reduction must use the same op as the gatherMany_min calls (this previously passed ogsAdd) */ + ogsHostGatherMany(H, k, type, ogsMin, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + /* NOTE(review): the loop header and memcpy destination were truncated in this copy of the patch and are reconstructed here; the destination offset does not involve gstride - confirm against upstream */ + for (int i=0;i<k;i++) + memcpy((char*)gv+ogs->NlocalGather*Nbytes*i, + (char*)ogs::hostBuf+ogs->NhaloGather*Nbytes*i, + ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gatherMany_min(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if 
(!strcmp(type, "double")) + gatherMany_min(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherMany_min(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherMany_min(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +/* Gather k strided fields of v into gv with the maximum ("max") reduction. Halo nodes are gathered into ogs::hostBuf, handed to ogsHostGatherMany as k per-field pointers, and the first NownedHalo entries of each field are copied into gv; the local nodes are then gathered straight into gv. Nbytes is the element size selected by the caller for `type`. */ +void ogsGatherMany_max(void *gv, void *v, const int k, const dlong gstride, const dlong stride, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherMany_max(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherMany_max(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherMany_max(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherMany_max(ogs->NhaloGather, k, stride, ogs->NhaloGather, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + void* H[k]; + /* H[i] addresses field i inside hostBuf; the halo gather above wrote the fields NhaloGather elements apart */ + for (int i=0;i<k;i++) H[i] = (char*)ogs::hostBuf + i*ogs->NhaloGather*Nbytes; + + /* the halo reduction must use the same op as the gatherMany_max calls (this previously passed ogsAdd) */ + ogsHostGatherMany(H, k, type, ogsMax, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + /* NOTE(review): the loop header and memcpy destination were truncated in this copy of the patch and are reconstructed here; the destination offset does not involve gstride - confirm against upstream */ + for (int i=0;i<k;i++) + memcpy((char*)gv+ogs->NlocalGather*Nbytes*i, + (char*)ogs::hostBuf+ogs->NhaloGather*Nbytes*i, + ogs->NownedHalo*Nbytes); + } + + if (!strcmp(type, "float")) + gatherMany_max(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherMany_max(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, 
(double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherMany_max(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherMany_max(ogs->NlocalGather, k, stride, gstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void occaGatherMany(const dlong Ngather, + const int Nentries, + const dlong stride, + const dlong gstride, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv) { + + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) + ogs::gatherManyKernel_floatAdd(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "mul"))) + ogs::gatherManyKernel_floatMul(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "min"))) + ogs::gatherManyKernel_floatMin(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "max"))) + ogs::gatherManyKernel_floatMax(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "add"))) + ogs::gatherManyKernel_doubleAdd(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "mul"))) + ogs::gatherManyKernel_doubleMul(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) + ogs::gatherManyKernel_doubleMin(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "max"))) + ogs::gatherManyKernel_doubleMax(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if 
((!strcmp(type, "int"))&&(!strcmp(op, "add"))) + ogs::gatherManyKernel_intAdd(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "mul"))) + ogs::gatherManyKernel_intMul(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "min"))) + ogs::gatherManyKernel_intMin(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "max"))) + ogs::gatherManyKernel_intMax(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "add"))) + ogs::gatherManyKernel_longAdd(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "mul"))) + ogs::gatherManyKernel_longMul(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "min"))) + ogs::gatherManyKernel_longMin(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "max"))) + ogs::gatherManyKernel_longMax(Ngather, Nentries, stride, gstride, o_gatherStarts, o_gatherIds, o_v, o_gv); +} diff --git a/libs/gatherScatter/src/ogsGatherScatter.cpp b/libs/gatherScatter/src/ogsGatherScatter.cpp new file mode 100644 index 000000000..f1c63f7eb --- /dev/null +++ b/libs/gatherScatter/src/ogsGatherScatter.cpp @@ -0,0 +1,154 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +void ogsGatherScatter(void *v, + const char *type, + const char *op, + ogs_t *ogs){ + ogsHostGatherScatter(v, type, op, ogs->hostGsh); +} + +void ogsGatherScatter(occa::memory o_v, + const char *type, + const char *op, + ogs_t *ogs){ + ogsGatherScatterStart (o_v, type, op, ogs); + ogsGatherScatterFinish(o_v, type, op, ogs); +} + +void ogsGatherScatterStart(occa::memory o_v, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + // gather halo nodes on device + if (ogs->NhaloGather) { + occaGather(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + + ogs->device.finish(); + 
ogs->device.setStream(ogs::dataStream); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsGatherScatterFinish(occa::memory o_v, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if(ogs->NlocalGather) { + occaGatherScatter(ogs->NlocalGather, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + // MPI based gather scatter using libgs + ogsHostGatherScatter(ogs::haloBuf, type, op, ogs->haloGshSym); + + // copy totally gather halo data back from HOST to DEVICE + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); + + // do scatter back to local nodes + occaScatter(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_v); + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + } +} + +void occaGatherScatter(const dlong Ngather, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v) { + + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) + ogs::gatherScatterKernel_floatAdd(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterKernel_floatMul(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "min"))) + ogs::gatherScatterKernel_floatMin(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "max"))) + ogs::gatherScatterKernel_floatMax(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if 
((!strcmp(type, "double"))&&(!strcmp(op, "add"))) + ogs::gatherScatterKernel_doubleAdd(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterKernel_doubleMul(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) + ogs::gatherScatterKernel_doubleMin(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "max"))) + ogs::gatherScatterKernel_doubleMax(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "add"))) + ogs::gatherScatterKernel_intAdd(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterKernel_intMul(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "min"))) + ogs::gatherScatterKernel_intMin(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "max"))) + ogs::gatherScatterKernel_intMax(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "add"))) + ogs::gatherScatterKernel_longAdd(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterKernel_longMul(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "min"))) + ogs::gatherScatterKernel_longMin(Ngather, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "max"))) + ogs::gatherScatterKernel_longMax(Ngather, o_gatherStarts, o_gatherIds, o_v); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsGatherScatterMany.cpp b/libs/gatherScatter/src/ogsGatherScatterMany.cpp new file mode 100644 index 000000000..7827a4604 --- /dev/null +++ b/libs/gatherScatter/src/ogsGatherScatterMany.cpp @@ -0,0 +1,182 @@ +/* + +The MIT License (MIT) + +Copyright (c) 
2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +void ogsGatherScatterMany(void *v, + const int k, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + void* V[k]; + for (int i=0;ihostGsh); +} + +void ogsGatherScatterMany(occa::memory o_v, + const int k, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + ogsGatherScatterManyStart (o_v, k, stride, type, op, ogs); + ogsGatherScatterManyFinish(o_v, k, stride, type, op, ogs); +} + +void ogsGatherScatterManyStart(occa::memory o_v, + const int k, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes*k); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + // gather halo nodes on device + if (ogs->NhaloGather) { + occaGatherMany(ogs->NhaloGather, k, stride, ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + + ogs->device.finish(); + ogs->device.setStream(ogs::dataStream); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsGatherScatterManyFinish(occa::memory o_v, + const int k, + const dlong stride, + const char *type, + const 
char *op, + ogs_t *ogs){ + + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if(ogs->NlocalGather) { + occaGatherScatterMany(ogs->NlocalGather, k, stride, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + void* H[k]; + for (int i=0;iNhaloGather*Nbytes; + + // MPI based gather scatter using libgs + ogsHostGatherScatterMany(H, k, type, op, ogs->haloGshSym); + + // copy totally gather halo data back from HOST to DEVICE + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + + // do scatter back to local nodes + occaScatterMany(ogs->NhaloGather, k, ogs->NhaloGather, stride, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_v); + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + } +} + +void occaGatherScatterMany(const dlong Ngather, + const int Nentries, + const dlong stride, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v) { + + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) + ogs::gatherScatterManyKernel_floatAdd(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterManyKernel_floatMul(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "min"))) + ogs::gatherScatterManyKernel_floatMin(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "max"))) + ogs::gatherScatterManyKernel_floatMax(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, 
"double"))&&(!strcmp(op, "add"))) + ogs::gatherScatterManyKernel_doubleAdd(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterManyKernel_doubleMul(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) + ogs::gatherScatterManyKernel_doubleMin(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "max"))) + ogs::gatherScatterManyKernel_doubleMax(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "add"))) + ogs::gatherScatterManyKernel_intAdd(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterManyKernel_intMul(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "min"))) + ogs::gatherScatterManyKernel_intMin(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "max"))) + ogs::gatherScatterManyKernel_intMax(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "add"))) + ogs::gatherScatterManyKernel_longAdd(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterManyKernel_longMul(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "min"))) + ogs::gatherScatterManyKernel_longMin(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "max"))) + ogs::gatherScatterManyKernel_longMax(Ngather, Nentries, stride, o_gatherStarts, o_gatherIds, o_v); +} \ No newline at end of file diff --git 
a/libs/gatherScatter/src/ogsGatherScatterVec.cpp b/libs/gatherScatter/src/ogsGatherScatterVec.cpp new file mode 100644 index 000000000..b8554ecf2 --- /dev/null +++ b/libs/gatherScatter/src/ogsGatherScatterVec.cpp @@ -0,0 +1,160 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +void ogsGatherScatterVec(void *v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + ogsHostGatherScatterVec(v, k, type, op, ogs->hostGsh); +} + +void ogsGatherScatterVec(occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + ogsGatherScatterVecStart (o_v, k, type, op, ogs); + ogsGatherScatterVecFinish(o_v, k, type, op, ogs); +} + +void ogsGatherScatterVecStart(occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes*k); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + // gather halo nodes on device + if (ogs->NhaloGather) { + occaGatherVec(ogs->NhaloGather, k, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + + ogs->device.finish(); + ogs->device.setStream(ogs::dataStream); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsGatherScatterVecFinish(occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if(ogs->NlocalGather) { + occaGatherScatterVec(ogs->NlocalGather, k, 
ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + // MPI based gather scatter using libgs + ogsHostGatherScatterVec(ogs::haloBuf, k, type, op, ogs->haloGshSym); + + // copy totally gather halo data back from HOST to DEVICE + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + + // do scatter back to local nodes + occaScatterVec(ogs->NhaloGather, k, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_v); + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + } +} + +void occaGatherScatterVec(const dlong Ngather, + const int Nentries, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v) { + + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) + ogs::gatherScatterVecKernel_floatAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterVecKernel_floatMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "min"))) + ogs::gatherScatterVecKernel_floatMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "max"))) + ogs::gatherScatterVecKernel_floatMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "add"))) + ogs::gatherScatterVecKernel_doubleAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterVecKernel_doubleMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) + ogs::gatherScatterVecKernel_doubleMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "max"))) + 
ogs::gatherScatterVecKernel_doubleMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "add"))) + ogs::gatherScatterVecKernel_intAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterVecKernel_intMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "min"))) + ogs::gatherScatterVecKernel_intMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "max"))) + ogs::gatherScatterVecKernel_intMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "add"))) + ogs::gatherScatterVecKernel_longAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "mul"))) + ogs::gatherScatterVecKernel_longMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "min"))) + ogs::gatherScatterVecKernel_longMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "max"))) + ogs::gatherScatterVecKernel_longMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsGatherVec.cpp b/libs/gatherScatter/src/ogsGatherVec.cpp new file mode 100644 index 000000000..f15438ac9 --- /dev/null +++ b/libs/gatherScatter/src/ogsGatherVec.cpp @@ -0,0 +1,339 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of 
the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +#include "gatherVec.tpp" + +void ogsGatherVec_add(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs); +void ogsGatherVec_mul(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs); +void ogsGatherVec_min(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs); +void ogsGatherVec_max(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs); + +void ogsGatherVec(occa::memory o_gv, + occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + ogsGatherVecStart (o_gv, o_v, k, type, op, ogs); + ogsGatherVecFinish(o_gv, o_v, k, type, op, ogs); +} + +void ogsGatherVecStart(occa::memory o_gv, + occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { + if 
(ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes*k); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + // gather halo nodes on device + if (ogs->NhaloGather) { + occaGatherVec(ogs->NhaloGather, k, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, o_v, ogs::o_haloBuf); + + ogs->device.finish(); + ogs->device.setStream(ogs::dataStream); + ogs::o_haloBuf.copyTo(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsGatherVecFinish(occa::memory o_gv, + occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if(ogs->NlocalGather) { + occaGatherVec(ogs->NlocalGather, k, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v, o_gv); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + // MPI based gather using libgs + ogsHostGatherVec(ogs::haloBuf, k, type, op, ogs->haloGshNonSym); + + // copy totally gather halo data back from HOST to DEVICE + if (ogs->NownedHalo) + o_gv.copyFrom(ogs::haloBuf, ogs->NownedHalo*Nbytes*k, + ogs->NlocalGather*Nbytes*k, "async: true"); + + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + } +} + +void ogsGatherVec(void *gv, + void *v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if 
(ogs::hostBufSize < ogs->NhaloGather*Nbytes*k) { + if (ogs::hostBufSize) free(ogs::hostBuf); + ogs::hostBuf = (void *) malloc(ogs->NhaloGather*Nbytes*k); + } + } + + if (!strcmp(op, "add")) + ogsGatherVec_add(gv, v, k, Nbytes, type, ogs); + else if (!strcmp(op, "mul")) + ogsGatherVec_mul(gv, v, k, Nbytes, type, ogs); + else if (!strcmp(op, "min")) + ogsGatherVec_min(gv, v, k, Nbytes, type, ogs); + else if (!strcmp(op, "max")) + ogsGatherVec_max(gv, v, k, Nbytes, type, ogs); +} + +void ogsGatherVec_add(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherVec_add(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherVec_add(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherVec_add(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherVec_add(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + ogsHostGatherVec(ogs::hostBuf, k, type, ogsAdd, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes*k, ogs::hostBuf, ogs->NownedHalo*Nbytes*k); + } + + if (!strcmp(type, "float")) + gatherVec_add(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherVec_add(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherVec_add(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherVec_add(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (long 
long int*)v, (long long int*)gv); +} + +void ogsGatherVec_mul(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherVec_mul(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherVec_mul(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherVec_mul(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherVec_mul(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + ogsHostGatherVec(ogs::hostBuf, k, type, ogsMul, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes*k, ogs::hostBuf, ogs->NownedHalo*Nbytes*k); + } + + if (!strcmp(type, "float")) + gatherVec_mul(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherVec_mul(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherVec_mul(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherVec_mul(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void ogsGatherVec_min(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherVec_min(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherVec_min(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, 
(double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherVec_min(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherVec_min(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + ogsHostGatherVec(ogs::hostBuf, k, type, ogsMin, ogs->haloGshNonSym); + + if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes*k, ogs::hostBuf, ogs->NownedHalo*Nbytes*k); + } + + if (!strcmp(type, "float")) + gatherVec_min(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherVec_min(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherVec_min(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherVec_min(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void ogsGatherVec_max(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + gatherVec_max(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)v, (float*)ogs::hostBuf); + else if (!strcmp(type, "double")) + gatherVec_max(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)v, (double*)ogs::hostBuf); + else if (!strcmp(type, "int")) + gatherVec_max(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)v, (int*)ogs::hostBuf); + else if (!strcmp(type, "long long int")) + gatherVec_max(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)v, (long long int*)ogs::hostBuf); + + if (ogs->NhaloGather) { + ogsHostGatherVec(ogs::hostBuf, k, type, ogsMax, ogs->haloGshNonSym); + 
+ if (ogs->NownedHalo) + memcpy((char*)gv+ogs->NlocalGather*Nbytes*k, ogs::hostBuf, ogs->NownedHalo*Nbytes*k); + } + + if (!strcmp(type, "float")) + gatherVec_max(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)gv); + else if (!strcmp(type, "double")) + gatherVec_max(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)gv); + else if (!strcmp(type, "int")) + gatherVec_max(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)gv); + else if (!strcmp(type, "long long int")) + gatherVec_max(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)gv); +} + +void occaGatherVec(const dlong Ngather, + const int Nentries, + occa::memory o_gatherStarts, + occa::memory o_gatherIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_gv) { + + if ((!strcmp(type, "float"))&&(!strcmp(op, "add"))) + ogs::gatherVecKernel_floatAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "mul"))) + ogs::gatherVecKernel_floatMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "min"))) + ogs::gatherVecKernel_floatMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "float"))&&(!strcmp(op, "max"))) + ogs::gatherVecKernel_floatMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "add"))) + ogs::gatherVecKernel_doubleAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "mul"))) + ogs::gatherVecKernel_doubleMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "double"))&&(!strcmp(op, "min"))) + ogs::gatherVecKernel_doubleMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if 
((!strcmp(type, "double"))&&(!strcmp(op, "max"))) + ogs::gatherVecKernel_doubleMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "add"))) + ogs::gatherVecKernel_intAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "mul"))) + ogs::gatherVecKernel_intMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "min"))) + ogs::gatherVecKernel_intMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "int"))&&(!strcmp(op, "max"))) + ogs::gatherVecKernel_intMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "add"))) + ogs::gatherVecKernel_longAdd(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "mul"))) + ogs::gatherVecKernel_longMul(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "min"))) + ogs::gatherVecKernel_longMin(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); + else if ((!strcmp(type, "long long int"))&&(!strcmp(op, "max"))) + ogs::gatherVecKernel_longMax(Ngather, Nentries, o_gatherStarts, o_gatherIds, o_v, o_gv); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsHostGather.c b/libs/gatherScatter/src/ogsHostGather.c new file mode 100644 index 000000000..1cf6c7630 --- /dev/null +++ b/libs/gatherScatter/src/ogsHostGather.c @@ -0,0 +1,87 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostGather(void *v, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs(v, gs_float, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_float, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_float, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_float, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs(v, gs_double, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_double, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_double, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_double, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs(v, gs_int, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_int, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_int, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + 
gs(v, gs_int, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs(v, gs_long_long, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_long_long, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_long_long, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_long_long, gs_max, 1, gsh, 0); + } +} + diff --git a/libs/gatherScatter/src/ogsHostGatherMany.c b/libs/gatherScatter/src/ogsHostGatherMany.c new file mode 100644 index 000000000..88d9d3c1c --- /dev/null +++ b/libs/gatherScatter/src/ogsHostGatherMany.c @@ -0,0 +1,87 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostGatherMany(void **v, const int k, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs_many(v, k, gs_float, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_float, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_float, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_float, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs_many(v, k, gs_double, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_double, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_double, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_double, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs_many(v, k, gs_int, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_int, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_int, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_int, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs_many(v, k, gs_long_long, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_long_long, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_long_long, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_long_long, gs_max, 1, gsh, 0); + } +} + diff --git a/libs/gatherScatter/src/ogsHostGatherScatter.c b/libs/gatherScatter/src/ogsHostGatherScatter.c new file mode 100644 index 000000000..68bb497f9 --- /dev/null +++ 
b/libs/gatherScatter/src/ogsHostGatherScatter.c @@ -0,0 +1,86 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostGatherScatter(void *v, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs(v, gs_float, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_float, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_float, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_float, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs(v, gs_double, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_double, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_double, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_double, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs(v, gs_int, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_int, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_int, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_int, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs(v, gs_long_long, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_long_long, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_long_long, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_long_long, gs_max, 0, gsh, 0); + } +} diff --git a/libs/gatherScatter/src/ogsHostGatherScatterMany.c b/libs/gatherScatter/src/ogsHostGatherScatterMany.c new file mode 100644 index 000000000..e9c4eb5fc --- /dev/null +++ b/libs/gatherScatter/src/ogsHostGatherScatterMany.c @@ -0,0 +1,82 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali 
Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostGatherScatterMany(void **v, const int k, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + if(!strcmp(op, "add")) + gs_many(v, k, gs_float, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_float, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_float, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_float, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "double")){ + if(!strcmp(op, "add")) + gs_many(v, k, gs_double, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_double, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_double, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_double, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "int")){ + if(!strcmp(op, "add")) + gs_many(v, k, gs_int, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_int, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_int, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_int, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + if(!strcmp(op, "add")) + gs_many(v, k, gs_long_long, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_long_long, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_long_long, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_long_long, gs_max, 0, gsh, 0); + } +} diff --git a/libs/gatherScatter/src/ogsHostGatherScatterVec.c b/libs/gatherScatter/src/ogsHostGatherScatterVec.c new file mode 100644 index 000000000..3ad53ed01 --- /dev/null +++ b/libs/gatherScatter/src/ogsHostGatherScatterVec.c @@ -0,0 +1,82 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, 
free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostGatherScatterVec(void *v, const int k, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + if(!strcmp(op, "add")) + gs_vec(v, k, gs_float, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_float, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_float, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_float, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "double")){ + if(!strcmp(op, "add")) + gs_vec(v, k, gs_double, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_double, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_double, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_double, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "int")){ + if(!strcmp(op, "add")) + gs_vec(v, k, gs_int, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_int, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_int, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_int, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + if(!strcmp(op, "add")) + gs_vec(v, k, gs_long_long, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_long_long, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_long_long, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_long_long, gs_max, 0, gsh, 0); + } +} diff --git a/libs/gatherScatter/src/ogsHostGatherVec.c b/libs/gatherScatter/src/ogsHostGatherVec.c new file mode 100644 index 000000000..13f72e77e --- /dev/null +++ b/libs/gatherScatter/src/ogsHostGatherVec.c @@ -0,0 +1,87 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a 
copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostGatherVec(void *v, const int k, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_float, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_float, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_float, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_float, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_double, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_double, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_double, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_double, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, 
"int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_int, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_int, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_int, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_int, gs_max, 1, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_long_long, gs_add, 1, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_long_long, gs_mul, 1, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_long_long, gs_min, 1, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_long_long, gs_max, 1, gsh, 0); + } +} + diff --git a/libs/gatherScatter/src/ogsHostScatter.c b/libs/gatherScatter/src/ogsHostScatter.c new file mode 100644 index 000000000..c3aa2c740 --- /dev/null +++ b/libs/gatherScatter/src/ogsHostScatter.c @@ -0,0 +1,86 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostScatter(void *v, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs(v, gs_float, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_float, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_float, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_float, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs(v, gs_double, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_double, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_double, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_double, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs(v, gs_int, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_int, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_int, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_int, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs(v, gs_long_long, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs(v, gs_long_long, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs(v, gs_long_long, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs(v, gs_long_long, gs_max, 0, gsh, 0); + } +} diff --git a/libs/gatherScatter/src/ogsHostScatterMany.c 
b/libs/gatherScatter/src/ogsHostScatterMany.c new file mode 100644 index 000000000..b890c4468 --- /dev/null +++ b/libs/gatherScatter/src/ogsHostScatterMany.c @@ -0,0 +1,86 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostScatterMany(void **v, const int k, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs_many(v, k, gs_float, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_float, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_float, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_float, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs_many(v, k, gs_double, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_double, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_double, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_double, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs_many(v, k, gs_int, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_int, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_int, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_int, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs_many(v, k, gs_long_long, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_many(v, k, gs_long_long, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_many(v, k, gs_long_long, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_many(v, k, gs_long_long, gs_max, 0, gsh, 0); + } +} diff --git a/libs/gatherScatter/src/ogsHostScatterVec.c b/libs/gatherScatter/src/ogsHostScatterVec.c new file mode 100644 index 000000000..513ea65ba --- /dev/null +++ b/libs/gatherScatter/src/ogsHostScatterVec.c 
@@ -0,0 +1,86 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +void ogsHostScatterVec(void *v, const int k, const char *type, const char *op, void *gsh){ + + /* need gs_float or gs_double */ + if(!strcmp(type, "float")){ + // printf("performing string gs on %s\n", type); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_float, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_float, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_float, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_float, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "double")){ + // printf("performing double gs on %s\n", type); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_double, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_double, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_double, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_double, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "int")){ + // printf("performing int gs\n"); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_int, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_int, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_int, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_int, gs_max, 0, gsh, 0); + } + + if(!strcmp(type, "long long int")){ + // printf("performing long_long gs\n"); + if(!strcmp(op, "add")) + gs_vec(v, k, gs_long_long, gs_add, 0, gsh, 0); + else if(!strcmp(op, "mul")) + gs_vec(v, k, gs_long_long, gs_mul, 0, gsh, 0); + else if(!strcmp(op, "min")) + gs_vec(v, k, gs_long_long, gs_min, 0, gsh, 0); + else if(!strcmp(op, "max")) + gs_vec(v, k, gs_long_long, gs_max, 0, gsh, 0); + } +} diff --git a/libs/gatherScatter/src/ogsHostSetup.c b/libs/gatherScatter/src/ogsHostSetup.c new file mode 100644 index 000000000..f654de848 --- /dev/null +++ b/libs/gatherScatter/src/ogsHostSetup.c @@ -0,0 +1,104 @@ +/* + +The MIT 
License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* compile with C compiler (not C++) */ + +#include +#include +#include +#include + +#include "gslib.h" + +#include "types.h" + +void *ogsHostSetup(MPI_Comm meshComm, + dlong NuniqueBases, + hlong *gatherGlobalNodes, + int nonsymm, + int verbose){ + + /* gslib stuff */ + comm_ext world; + struct comm com; + + /* MPI_Comm_dup(MPI_COMM_WORLD, (MPI_Comm*) &world); */ + world = (comm_ext)meshComm; // MPI_COMM_WORLD; + + comm_init(&com, world); + + /* for the moment borrow gslib array */ + slong *id = tmalloc(slong, NuniqueBases); + + dlong n; + for(n=0;nNhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + if(ogs->NlocalGather) { + occaScatter(ogs->NlocalGather, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v, o_sv); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + + if (ogs->NownedHalo) + o_v.copyTo(ogs::haloBuf, ogs->NownedHalo*Nbytes, + ogs->NlocalGather*Nbytes, "async: true"); + + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsScatterFinish(occa::memory o_sv, + occa::memory o_v, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + // MPI based scatter using gslib + ogsHostScatter(ogs::haloBuf, type, op, ogs->haloGshNonSym); + + // copy totally scattered halo data back from HOST to DEVICE + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes, 0, "async: true"); + + ogs->device.finish(); + 
ogs->device.setStream(ogs::defaultStream); + + occaScatter(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_sv); + } +} + +void ogsScatter(void *sv, + void *v, + const char *type, + const char *op, + ogs_t *ogs){ + + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::hostBufSize < ogs->NhaloGather*Nbytes) { + if (ogs::hostBufSize) free(ogs::hostBuf); + ogs::hostBuf = (void *) malloc(ogs->NhaloGather*Nbytes); + } + } + + ogsScatter_op(sv, v, Nbytes, type, ogs); +} + +void ogsScatter_op(void *sv, void *v, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + scatter(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)sv); + else if (!strcmp(type, "double")) + scatter(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)sv); + else if (!strcmp(type, "int")) + scatter(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)sv); + else if (!strcmp(type, "long long int")) + scatter(ogs->NlocalGather, ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)sv); + + if (ogs->NhaloGather) { + if (ogs->NownedHalo) + memcpy(ogs::hostBuf, (char*) v+ogs->NlocalGather*Nbytes, ogs->NownedHalo*Nbytes); + + // MPI based scatter using gslib + ogsHostScatter(ogs::hostBuf, type, ogsAdd, ogs->haloGshNonSym); + } + + if (!strcmp(type, "float")) + scatter(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)ogs::hostBuf, (float*)sv); + else if (!strcmp(type, "double")) + scatter(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)ogs::hostBuf, (double*)sv); + else if (!strcmp(type, "int")) + 
scatter(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)ogs::hostBuf, (int*)sv); + else if (!strcmp(type, "long long int")) + scatter(ogs->NhaloGather, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)ogs::hostBuf, (long long int*)sv); +} + + +void occaScatter(const dlong Nscatter, + occa::memory o_scatterStarts, + occa::memory o_scatterIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_sv) { + + if (!strcmp(type, "float")) + ogs::scatterKernel_float(Nscatter, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "double")) + ogs::scatterKernel_double(Nscatter, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "int")) + ogs::scatterKernel_int(Nscatter, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "long long int")) + ogs::scatterKernel_long(Nscatter, o_scatterStarts, o_scatterIds, o_v, o_sv); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsScatterMany.cpp b/libs/gatherScatter/src/ogsScatterMany.cpp new file mode 100644 index 000000000..882fc6e7c --- /dev/null +++ b/libs/gatherScatter/src/ogsScatterMany.cpp @@ -0,0 +1,241 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +#include "scatterMany.tpp" + +void ogsScatterMany_op(void *gv, void *v, + const int k, + const dlong sstride, + const dlong stride, + const size_t Nbytes, + const char *type, + ogs_t *ogs); + +void ogsScatterMany(occa::memory o_sv, + occa::memory o_v, + const int k, + const dlong sstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + ogsScatterManyStart (o_sv, o_v, k, sstride, stride, type, op, ogs); + ogsScatterManyFinish(o_sv, o_v, k, sstride, stride, type, op, ogs); +} + +void ogsScatterManyStart(occa::memory o_sv, + occa::memory o_v, + const int k, + const dlong sstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes*k); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + if(ogs->NlocalGather) { + occaScatterMany(ogs->NlocalGather, k, stride, sstride, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v, o_sv); + } + + if (ogs->NhaloGather) { + 
ogs->device.setStream(ogs::dataStream); + + if (ogs->NownedHalo) { + for (int i=0;iNhaloGather*Nbytes*i, + ogs->NownedHalo*Nbytes, ogs->NlocalGather*Nbytes*i, "async: true"); + } + + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsScatterManyFinish(occa::memory o_sv, + occa::memory o_v, + const int k, + const dlong sstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + void* H[k]; + for (int i=0;iNhaloGather*Nbytes; + + // MPI based scatter using gslib + ogsHostScatterMany(H, k, type, op, ogs->haloGshNonSym); + + // copy totally scattered halo data back from HOST to DEVICE + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + + occaScatterMany(ogs->NhaloGather, k, ogs->NhaloGather, sstride, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_sv); + } +} + +void ogsScatterMany(void *sv, + void *v, + const int k, + const dlong sstride, + const dlong stride, + const char *type, + const char *op, + ogs_t *ogs){ + + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::hostBufSize < ogs->NhaloGather*Nbytes*k) { + if (ogs::hostBufSize) free(ogs::hostBuf); + ogs::hostBuf = (void *) malloc(ogs->NhaloGather*Nbytes*k); + } + } + + ogsScatterMany_op(sv, v, k, sstride, stride, Nbytes, type, ogs); 
+} + +void ogsScatterMany_op(void *sv, void *v, + const int k, + const dlong sstride, + const dlong stride, + const size_t Nbytes, + const char *type, + ogs_t *ogs){ + + if (!strcmp(type, "float")) + scatterMany(ogs->NlocalGather, k, stride, sstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)sv); + else if (!strcmp(type, "double")) + scatterMany(ogs->NlocalGather, k, stride, sstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)sv); + else if (!strcmp(type, "int")) + scatterMany(ogs->NlocalGather, k, stride, sstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)sv); + else if (!strcmp(type, "long long int")) + scatterMany(ogs->NlocalGather, k, stride, sstride, + ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)sv); + + if (ogs->NhaloGather) { + if (ogs->NownedHalo) + for (int i=0;iNhaloGather*Nbytes*i, + (char*) v+ogs->NlocalGather*Nbytes*i, + ogs->NownedHalo*Nbytes); + + + void* H[k]; + for (int i=0;iNhaloGather*Nbytes; + + // MPI based scatter using gslib + ogsHostScatterMany(H, k, type, ogsAdd, ogs->haloGshNonSym); + } + + if (!strcmp(type, "float")) + scatterMany(ogs->NhaloGather, k, ogs->NhaloGather, sstride, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)ogs::hostBuf, (float*)sv); + else if (!strcmp(type, "double")) + scatterMany(ogs->NhaloGather, k, ogs->NhaloGather, sstride, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)ogs::hostBuf, (double*)sv); + else if (!strcmp(type, "int")) + scatterMany(ogs->NhaloGather, k, ogs->NhaloGather, sstride, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)ogs::hostBuf, (int*)sv); + else if (!strcmp(type, "long long int")) + scatterMany(ogs->NhaloGather, k, ogs->NhaloGather, sstride, + ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)ogs::hostBuf, (long long int*)sv); +} + +void occaScatterMany(const dlong Nscatter, + const int Nentries, + const dlong stride, + const dlong sstride, 
+ occa::memory o_scatterStarts, + occa::memory o_scatterIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_sv) { + + if (!strcmp(type, "float")) + ogs::scatterManyKernel_float(Nscatter, Nentries, stride, sstride, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "double")) + ogs::scatterManyKernel_double(Nscatter, Nentries, stride, sstride, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "int")) + ogs::scatterManyKernel_int(Nscatter, Nentries, stride, sstride, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "long long int")) + ogs::scatterManyKernel_long(Nscatter, Nentries, stride, sstride, o_scatterStarts, o_scatterIds, o_v, o_sv); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsScatterVec.cpp b/libs/gatherScatter/src/ogsScatterVec.cpp new file mode 100644 index 000000000..6d90bff65 --- /dev/null +++ b/libs/gatherScatter/src/ogsScatterVec.cpp @@ -0,0 +1,199 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +#include "scatterVec.tpp" + +void ogsScatterVec_op(void *gv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs); + +void ogsScatterVec(occa::memory o_sv, + occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + ogsScatterVecStart (o_sv, o_v, k, type, op, ogs); + ogsScatterVecFinish(o_sv, o_v, k, type, op, ogs); +} + +void ogsScatterVecStart(occa::memory o_sv, + occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::o_haloBuf.size() < ogs->NhaloGather*Nbytes*k) { + if (ogs::o_haloBuf.size()) ogs::o_haloBuf.free(); + ogs::o_haloBuf = ogs->device.mappedAlloc(ogs->NhaloGather*Nbytes*k); + ogs::haloBuf = ogs::o_haloBuf.getMappedPointer(); + } + } + + if(ogs->NlocalGather) { + occaScatterVec(ogs->NlocalGather, k, ogs->o_localGatherOffsets, ogs->o_localGatherIds, type, op, o_v, o_sv); + } + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + + if (ogs->NownedHalo) + o_v.copyTo(ogs::haloBuf, ogs->NownedHalo*Nbytes*k, + ogs->NlocalGather*Nbytes*k, "async: true"); + + ogs->device.setStream(ogs::defaultStream); + } +} + + +void ogsScatterVecFinish(occa::memory o_sv, + occa::memory o_v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + 
else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + ogs->device.setStream(ogs::dataStream); + ogs->device.finish(); + + // MPI based scatter using gslib + ogsHostScatterVec(ogs::haloBuf, k, type, op, ogs->haloGshNonSym); + + // copy totally scattered halo data back from HOST to DEVICE + ogs::o_haloBuf.copyFrom(ogs::haloBuf, ogs->NhaloGather*Nbytes*k, 0, "async: true"); + + ogs->device.finish(); + ogs->device.setStream(ogs::defaultStream); + + occaScatterVec(ogs->NhaloGather, k, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, type, op, ogs::o_haloBuf, o_sv); + } +} + +void ogsScatterVec(void *sv, + void *v, + const int k, + const char *type, + const char *op, + ogs_t *ogs){ + + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + if (ogs->NhaloGather) { + if (ogs::hostBufSize < ogs->NhaloGather*Nbytes*k) { + if (ogs::hostBufSize) free(ogs::hostBuf); + ogs::hostBuf = (void *) malloc(ogs->NhaloGather*Nbytes*k); + } + } + + ogsScatterVec_op(sv, v, k, Nbytes, type, ogs); +} + +void ogsScatterVec_op(void *sv, void *v, const int k, const size_t Nbytes, const char *type, ogs_t *ogs){ + + if (!strcmp(type, "float")) + scatterVec(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (float*)v, (float*)sv); + else if (!strcmp(type, "double")) + scatterVec(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (double*)v, (double*)sv); + else if (!strcmp(type, "int")) + scatterVec(ogs->NlocalGather, k, ogs->localGatherOffsets, + ogs->localGatherIds, (int*)v, (int*)sv); + else if (!strcmp(type, "long long int")) + scatterVec(ogs->NlocalGather, k, 
ogs->localGatherOffsets, + ogs->localGatherIds, (long long int*)v, (long long int*)sv); + + if (ogs->NhaloGather) { + if (ogs->NownedHalo) + memcpy(ogs::hostBuf, (char*) v+ogs->NlocalGather*Nbytes*k, ogs->NownedHalo*Nbytes*k); + + // MPI based scatterVec using gslib + ogsHostScatterVec(ogs::hostBuf, k, type, ogsAdd, ogs->haloGshNonSym); + } + + if (!strcmp(type, "float")) + scatterVec(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (float*)ogs::hostBuf, (float*)sv); + else if (!strcmp(type, "double")) + scatterVec(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (double*)ogs::hostBuf, (double*)sv); + else if (!strcmp(type, "int")) + scatterVec(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (int*)ogs::hostBuf, (int*)sv); + else if (!strcmp(type, "long long int")) + scatterVec(ogs->NhaloGather, k, ogs->haloGatherOffsets, + ogs->haloGatherIds, (long long int*)ogs::hostBuf, (long long int*)sv); +} + +void occaScatterVec(const dlong Nscatter, + const int Nentries, + occa::memory o_scatterStarts, + occa::memory o_scatterIds, + const char* type, + const char* op, + occa::memory o_v, + occa::memory o_sv) { + + if (!strcmp(type, "float")) + ogs::scatterVecKernel_float(Nscatter, Nentries, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "double")) + ogs::scatterVecKernel_double(Nscatter, Nentries, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "int")) + ogs::scatterVecKernel_int(Nscatter, Nentries, o_scatterStarts, o_scatterIds, o_v, o_sv); + else if (!strcmp(type, "long long int")) + ogs::scatterVecKernel_long(Nscatter, Nentries, o_scatterStarts, o_scatterIds, o_v, o_sv); +} \ No newline at end of file diff --git a/libs/gatherScatter/src/ogsSetup.cpp b/libs/gatherScatter/src/ogsSetup.cpp new file mode 100644 index 000000000..81ef56acb --- /dev/null +++ b/libs/gatherScatter/src/ogsSetup.cpp @@ -0,0 +1,367 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel 
Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogsKernels.hpp" +#include "ogsInterface.h" + +typedef struct{ + + dlong localId; // local node id + hlong baseId; // original global index + + dlong newId; // new global id + int owned; + +}parallelNode_t; + +// compare on baseId then by localId +int compareBaseId(const void *a, const void *b){ + + parallelNode_t *fa = (parallelNode_t*) a; + parallelNode_t *fb = (parallelNode_t*) b; + + if(abs(fa->baseId) < abs(fb->baseId)) return -1; //group by abs(baseId) + if(abs(fa->baseId) > abs(fb->baseId)) return +1; + + if(fa->localId < fb->localId) return -1; //sort by local id + if(fa->localId > fb->localId) return +1; + + return 0; +} + +// compare on haloOwned then localId +int compareLocalId(const void *a, const void *b){ + + parallelNode_t *fa = (parallelNode_t*) a; + parallelNode_t *fb = (parallelNode_t*) b; + + if(fa->localId < fb->localId) return -1; + if(fa->localId > fb->localId) return +1; + + return 0; +} + +ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, + int verbose, occa::device device){ + + ogs_t *ogs = (ogs_t*) calloc(1, sizeof(ogs_t)); + + //Keep track of how many gs handles we've created, and + // build kernels if this is the first + if (!ogs::Nrefs) ogs::initKernels(comm, device); + ogs::Nrefs++; + + ogs->N = N; + ogs->comm = comm; + + int rank, size; + MPI_Comm_rank(ogs->comm, &rank); + MPI_Comm_size(ogs->comm, &size); + + //make a host gs handle (calls gslib) + ogs->hostGsh = ogsHostSetup(comm, N, ids, 0, 0); + + //use the host gs to find what nodes are local to this rank + int *minRank = (int *) calloc(N,sizeof(int)); + int *maxRank = (int *) calloc(N,sizeof(int)); + hlong *flagIds = (hlong *) calloc(N,sizeof(hlong)); + for (dlong i=0;ihostGsh); //minRank[n] contains the smallest rank taking part in the gather of node n + ogsHostGatherScatter(maxRank, ogsInt, ogsMax, ogs->hostGsh); //maxRank[n] contains the largest rank taking part in the gather of node n + ogsGsUnique(flagIds, N, comm); //one unique node 
in each group is 'flagged' kept positive while others are turned negative. + + //count local and halo nodes + ogs->Nlocal=0; ogs->Nhalo=0; ogs->NownedHalo=0; + for (dlong i=0;iNhalo++; + if (flagIds[i]>0) ogs->NownedHalo++; + } else { + ogs->Nlocal++; + } + } + + //set up the local gatherScatter + parallelNode_t *localNodes; + + if (ogs->Nlocal) { + localNodes = (parallelNode_t*) calloc(ogs->Nlocal,sizeof(parallelNode_t)); + + dlong cnt=0; + for (dlong i=0;iNlocal, sizeof(parallelNode_t), compareBaseId); + + ogs->NlocalGather = 0; + localNodes[0].newId = 0; + localNodes[0].owned = 1; + for (dlong i=1;iNlocal;i++) { + int s = 0; + if (localNodes[i].baseId!=localNodes[i-1].baseId) { + ogs->NlocalGather++; + s = 1; + } + localNodes[i].newId = ogs->NlocalGather; + localNodes[i].owned = s; + } + ogs->NlocalGather++; + + // sort based on local ids + qsort(localNodes, ogs->Nlocal, sizeof(parallelNode_t), compareLocalId); + + //tally up how many nodes are being gathered to each gatherNode and + // map to a local ordering + dlong *localGatherCounts = (dlong*) calloc(ogs->NlocalGather,sizeof(dlong)); + dlong *localGatherMap = (dlong*) calloc(ogs->NlocalGather,sizeof(dlong)); + cnt = 0; + for (dlong i=0;iNlocal;i++) { + dlong newId = localNodes[i].newId; //get the ordered id + + if (localNodes[i].owned) + localGatherMap[newId] = cnt++; //record a new index if this is a new gatherNode + + localNodes[i].newId = localGatherMap[newId]; //reorder + localGatherCounts[localGatherMap[newId]]++; //tally + } + free(localGatherMap); + + ogs->localGatherOffsets = (dlong*) calloc(ogs->NlocalGather+1,sizeof(dlong)); + for (dlong i=0;iNlocalGather;i++) { + ogs->localGatherOffsets[i+1] = ogs->localGatherOffsets[i] + localGatherCounts[i]; + localGatherCounts[i] = 0; + } + + ogs->localGatherIds = (dlong*) calloc(ogs->Nlocal,sizeof(dlong)); + for (dlong i=0;iNlocal;i++) { + dlong gatherId = localNodes[i].newId; + dlong offset = ogs->localGatherOffsets[gatherId]; + int index = 
localGatherCounts[gatherId]; + + ogs->localGatherIds[offset+index] = localNodes[i].localId; + localGatherCounts[gatherId]++; + } + free(localGatherCounts); + + ogs->o_localGatherOffsets = device.malloc((ogs->NlocalGather+1)*sizeof(dlong), ogs->localGatherOffsets); + ogs->o_localGatherIds = device.malloc((ogs->Nlocal)*sizeof(dlong), ogs->localGatherIds); + + free(localNodes); + } + + //set up the halo gatherScatter + parallelNode_t *haloNodes; + if (ogs->Nhalo) { + haloNodes = (parallelNode_t*) calloc(ogs->Nhalo,sizeof(parallelNode_t)); + + dlong cnt=0; + for (dlong i=0;iNhalo, sizeof(parallelNode_t), compareBaseId); + + //move the flagged node to the lowest local index if present + cnt = 0; + ogs->NhaloGather=0; + haloNodes[0].newId = 0; + haloNodes[0].owned = 1; + + for (dlong i=1;iNhalo;i++) { + int s = 0; + if (abs(haloNodes[i].baseId)!=abs(haloNodes[i-1].baseId)) { //new gather node + s = 1; + cnt = i; + ogs->NhaloGather++; + } + + haloNodes[i].owned = s; + haloNodes[i].newId = ogs->NhaloGather; + if (haloNodes[i].baseId>0) { + haloNodes[i].baseId = -abs(haloNodes[i].baseId); + haloNodes[cnt].baseId = abs(haloNodes[cnt].baseId); + } + } + ogs->NhaloGather++; + + // sort based on local ids + qsort(haloNodes, ogs->Nhalo, sizeof(parallelNode_t), compareLocalId); + + //tally up how many nodes are being gathered to each gatherNode and + // map to a local ordering + dlong *haloGatherCounts = (dlong*) calloc(ogs->NhaloGather,sizeof(dlong)); + dlong *haloGatherMap = (dlong*) calloc(ogs->NhaloGather,sizeof(dlong)); + hlong *symIds = (hlong *) calloc(ogs->NhaloGather,sizeof(hlong)); + hlong *nonSymIds = (hlong *) calloc(ogs->NhaloGather,sizeof(hlong)); + + cnt = 0; + dlong cnt2 = ogs->NownedHalo; + for (dlong i=0;iNhalo;i++) { + dlong newId = haloNodes[i].newId; //get the ordered id + + if (haloNodes[i].owned) { + dlong c; + if (haloNodes[i].baseId>0) + c = cnt++; + else + c = cnt2++; + + symIds[c] = abs(haloNodes[i].baseId); //record the base id + nonSymIds[c] = 
haloNodes[i].baseId; //record the base id + haloGatherMap[newId] = c; //record a new index if this is a new gatherNode + } + + haloNodes[i].newId = haloGatherMap[newId]; //reorder + haloGatherCounts[haloGatherMap[newId]]++; //tally + } + free(haloGatherMap); + + ogs->haloGatherOffsets = (dlong*) calloc(ogs->NhaloGather+1,sizeof(dlong)); + for (dlong i=0;iNhaloGather;i++) { + ogs->haloGatherOffsets[i+1] = ogs->haloGatherOffsets[i] + haloGatherCounts[i]; + haloGatherCounts[i] = 0; + } + + ogs->haloGatherIds = (dlong*) calloc(ogs->Nhalo,sizeof(dlong)); + for (dlong i=0;iNhalo;i++) { + dlong gatherId = haloNodes[i].newId; + dlong offset = ogs->haloGatherOffsets[gatherId]; + int index = haloGatherCounts[gatherId]; + + ogs->haloGatherIds[offset+index] = haloNodes[i].localId; + haloGatherCounts[gatherId]++; + } + free(haloGatherCounts); + + ogs->o_haloGatherOffsets = device.malloc((ogs->NhaloGather+1)*sizeof(dlong), ogs->haloGatherOffsets); + ogs->o_haloGatherIds = device.malloc((ogs->Nhalo)*sizeof(dlong), ogs->haloGatherIds); + + //make a host gs handle + ogs->haloGshSym = ogsHostSetup(comm, ogs->NhaloGather, symIds, 0,0); + ogs->haloGshNonSym = ogsHostSetup(comm, ogs->NhaloGather, nonSymIds, 0,0); + + free(symIds); free(nonSymIds); + free(haloNodes); + } + free(minRank); free(maxRank); free(flagIds); + + //total number of owned gathered nodes + ogs->Ngather = ogs->NlocalGather+ogs->NownedHalo; + + ogs->device = device; + + // build degree vectors + ogs->invDegree = (dfloat*) calloc(N, sizeof(dfloat)); + ogs->gatherInvDegree = (dfloat*) calloc(ogs->Ngather, sizeof(dfloat)); + for(dlong n=0;ninvDegree[n] = 1; + + ogs->o_invDegree = device.malloc(N*sizeof(dfloat), ogs->invDegree); + ogs->o_gatherInvDegree = device.malloc(ogs->Ngather*sizeof(dfloat), ogs->gatherInvDegree); + + ogsGather(ogs->o_gatherInvDegree, ogs->o_invDegree, ogsDfloat, ogsAdd, ogs); + + if(ogs->Ngather) + ogs->o_gatherInvDegree.copyTo(ogs->gatherInvDegree); + + ogsScatter(ogs->o_invDegree, 
ogs->o_gatherInvDegree, ogsDfloat, ogsAdd, ogs); + + ogs->o_invDegree.copyTo(ogs->invDegree); + + for(dlong n=0;nN;++n) + ogs->invDegree[n] = 1./ogs->invDegree[n]; + + for(dlong n=0;nNgather;++n) + ogs->gatherInvDegree[n] = 1./ogs->gatherInvDegree[n]; + + if(ogs->Ngather) + ogs->o_gatherInvDegree.copyFrom(ogs->gatherInvDegree); + + if(ogs->N) + ogs->o_invDegree.copyFrom(ogs->invDegree); + + return ogs; +} + + +void ogsFree(ogs_t *ogs) { + + if (ogs->Nlocal) { + free(ogs->localGatherOffsets); + free(ogs->localGatherIds); + ogs->o_localGatherOffsets.free(); + ogs->o_localGatherIds.free(); + } + + if (ogs->Nhalo) { + free(ogs->haloGatherOffsets); + free(ogs->haloGatherIds); + ogs->o_haloGatherOffsets.free(); + ogs->o_haloGatherIds.free(); + ogsHostFree(ogs->haloGshSym); + ogsHostFree(ogs->haloGshNonSym); + } + + if (ogs->N) { + free(ogs->invDegree); + ogs->o_invDegree.free(); + ogsHostFree(ogs->hostGsh); + } + + if (ogs->Ngather) { + free(ogs->gatherInvDegree); + ogs->o_gatherInvDegree.free(); + } + + free(ogs); + + ogs::Nrefs--; + if (!ogs::Nrefs) ogs::freeKernels(); +} \ No newline at end of file diff --git a/solvers/acoustics/makefile b/solvers/acoustics/makefile index 7eb778142..e8eb555e3 100644 --- a/solvers/acoustics/makefile +++ b/solvers/acoustics/makefile @@ -9,6 +9,7 @@ include ${OCCA_DIR}/scripts/Makefile # define variables HDRDIR = ../../include +OGSDIR = ../../libs/gatherScatter # set options for this machine # specify which compilers to use for c, fortran and linking @@ -16,7 +17,7 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DACOUSTICS='"${CURDIR}"' +CFLAGS = -I. 
-DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DACOUSTICS='"${CURDIR}"' # link flags to be used LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g diff --git a/solvers/bns/makefile b/solvers/bns/makefile index 252cb54ec..7f6f2e355 100644 --- a/solvers/bns/makefile +++ b/solvers/bns/makefile @@ -9,7 +9,8 @@ include ${OCCA_DIR}/scripts/Makefile # define variables HDRDIR = ../../include -GSDIR = ../../3rdParty/gslib.github/src +GSDIR = ../../3rdParty/gslib +OGSDIR = ../../libs/gatherScatter # set options for this machine # specify which compilers to use for c, fortran and linking @@ -18,20 +19,21 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(GSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DBNS='"${CURDIR}"' +CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DBNS='"${CURDIR}"' # link flags to be used -LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) -g -L../../3rdParty/gslib.github -lgs -fopenmp +LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) -g -fopenmp # libraries to be linked in -LIBS = -L$(OCCA_DIR)/lib $(links) +LIBS = -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \ + -L$(OCCA_DIR)/lib $(links) INCLUDES = bns.h DEPS = $(INCLUDES) \ $(HDRDIR)/mesh.h \ $(HDRDIR)/mesh2D.h \ $(HDRDIR)/mesh3D.h \ -$(HDRDIR)/ogs_t.h +$(OGSDIR)/ogs.hpp # types of files we are going to construct rules for .SUFFIXES: .c @@ -124,31 +126,20 @@ LOBJS = \ ../../src/setupAide.o \ ../../src/trace.o \ ../../src/readArray.o \ -../../src/meshParallelConsecutiveGlobalNumbering.o\ -../../src/meshParallelGatherScatter.o \ ../../src/meshParallelGatherScatterSetup.o \ ../../src/occaDeviceConfig.o\ ../../src/occaHostMallocPinned.o \ ../../src/timer.o -COBJS = \ -../../src/gsParallelGatherScatter.o\ -../../src/gsParallelGatherScatterSetup.o\ - -bnsMain:$(AOBJS) $(LOBJS) 
./src/bnsMain.o gslibInterface +bnsMain:$(AOBJS) $(LOBJS) ./src/bnsMain.o libogs $(LD) $(LDFLAGS) -o bnsMain $(COBJS) $(AOBJS) $(LOBJS) $(paths) $(LIBS) - - -gslibInterface: - cd ../../3rdParty/gslib.github; make -j libgs.a; cd ../../solvers/bns - $(cc) $(CFLAGS) -c -o ../../src/gsParallelGatherScatter.o ../../src/gsParallelGatherScatter.c $(paths) - $(cc) $(CFLAGS) -c -o ../../src/gsParallelGatherScatterSetup.o ../../src/gsParallelGatherScatterSetup.c $(paths) - +libogs: + cd ../../libs/gatherScatter; make -j lib; cd ../../solvers/bns # what to do if user types "make clean" clean : - cd ../../3rdParty/gslib.github; make clean; cd ../../solvers/bns + cd ../../libs/gatherScatter; make clean; cd ../../solvers/bns rm -r $(AOBJS) $(LOBJS) bnsMain diff --git a/solvers/bns/src/bnsReport.c b/solvers/bns/src/bnsReport.c index 721d7684d..5a7275a52 100644 --- a/solvers/bns/src/bnsReport.c +++ b/solvers/bns/src/bnsReport.c @@ -38,7 +38,7 @@ mesh_t *mesh = bns->mesh; bns->o_VortMag); if(bns->dim==3){ - meshParallelGatherScatter(mesh, mesh->ogs, bns->o_VortMag); + ogsGatherScatter(bns->o_VortMag, ogsDfloat, ogsAdd, mesh->ogs); int Ntotal = mesh->Np*mesh->Nelements; bns->dotMultiplyKernel(Ntotal, bns->o_VortMag, mesh->ogs->o_invDegree); } diff --git a/solvers/bns/src/bnsSetup.c b/solvers/bns/src/bnsSetup.c index 4f4313980..88ff29d64 100644 --- a/solvers/bns/src/bnsSetup.c +++ b/solvers/bns/src/bnsSetup.c @@ -876,16 +876,11 @@ if(options.compareArgs("TIME INTEGRATOR","SARK")){ - // Setup Gather Scales - + // Setup GatherScatter if(bns->dim==3){ int verbose = 1; - mesh->ogs = meshParallelGatherScatterSetup(mesh,mesh->Np*mesh->Nelements, - mesh->gatherLocalIds, - mesh->gatherBaseIds, - mesh->gatherBaseRanks, - mesh->gatherHaloFlags, - verbose); + dlong Ntotal = mesh->Np*mesh->Nelements; + meshParallelGatherScatterSetup(mesh, Ntotal, mesh->globalIds, mesh->comm, verbose); } return bns; diff --git a/solvers/cns/makefile b/solvers/cns/makefile index 1c478735e..8d20445c2 100644 --- 
a/solvers/cns/makefile +++ b/solvers/cns/makefile @@ -9,6 +9,8 @@ include ${OCCA_DIR}/scripts/Makefile # define variables HDRDIR = ../../include +GSDIR = ../../3rdParty/gslib +OGSDIR = ../../libs/gatherScatter # set options for this machine # specify which compilers to use for c, fortran and linking @@ -16,19 +18,21 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DCNS='"${CURDIR}"' +CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DCNS='"${CURDIR}"' # link flags to be used LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g # libraries to be linked in -LIBS = -L$(OCCA_DIR)/lib $(links) +LIBS = -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \ + -L$(OCCA_DIR)/lib $(links) INCLUDES = cns.h DEPS = $(INCLUDES) \ $(HDRDIR)/mesh.h \ -$(HDRDIR)/mesh3D.h +$(HDRDIR)/mesh3D.h \ +$(OGSDIR)/ogs.hpp # types of files we are going to construct rules for .SUFFIXES: .c @@ -105,9 +109,12 @@ OBJS = \ ../../src/timer.o -cnsMain:$(OBJS) +cnsMain:$(OBJS) libogs $(LD) $(LDFLAGS) -o cnsMain $(OBJS) $(paths) $(LIBS) +libogs: + cd ../../libs/gatherScatter; make -j lib; cd ../../solvers/cns + # what to do if user types "make clean" clean : rm -r $(OBJS) cnsMain diff --git a/solvers/elliptic/elliptic.h b/solvers/elliptic/elliptic.h index ac354a21e..6b6a50597 100644 --- a/solvers/elliptic/elliptic.h +++ b/solvers/elliptic/elliptic.h @@ -110,14 +110,6 @@ typedef struct { occa::memory o_EXYZ; // element vertices for reconstructing geofacs (trilinear hexes only) occa::memory o_gllzw; // GLL nodes and weights - - // list of elements that are needed for global gather-scatter - dlong NglobalGatherElements; - occa::memory o_globalGatherElementList; - - // list of elements that are not needed for global gather-scatter - dlong NlocalGatherElements; - occa::memory o_localGatherElementList; 
occa::kernel AxKernel; occa::kernel partialAxKernel; @@ -144,9 +136,6 @@ typedef struct { elliptic_t *ellipticSetup(mesh2D *mesh, dfloat lambda, occa::properties &kernelInfo, setupAide options); -void ellipticParallelGatherScatter(mesh2D *mesh, ogs_t *ogs, occa::memory &o_v, const char *type, const char *op); -void ellipticParallelGatherScatterSetup(elliptic_t *elliptic); - void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, occa::memory &o_r, occa::memory &o_z); void ellipticPreconditionerSetup(elliptic_t *elliptic, ogs_t *ogs, dfloat lambda); diff --git a/solvers/elliptic/makefile b/solvers/elliptic/makefile index 82990cdde..531dcac9b 100644 --- a/solvers/elliptic/makefile +++ b/solvers/elliptic/makefile @@ -9,7 +9,8 @@ include ${OCCA_DIR}/scripts/Makefile # define variables HDRDIR = ../../include -GSDIR = ../../3rdParty/gslib.github/src +GSDIR = ../../3rdParty/gslib +OGSDIR = ../../libs/gatherScatter ALMONDDIR = ../parALMOND # set options for this machine @@ -19,15 +20,15 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(GSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DELLIPTIC='"${CURDIR}"' +CFLAGS = -I. 
-DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DELLIPTIC='"${CURDIR}"' # link flags to be used -LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g -L../../3rdParty/gslib.github -lgs \ - -L$(ALMONDDIR) -lparALMOND +LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g # libraries to be linked in -LIBS = -L$(OCCA_DIR)/lib $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran +LIBS = -L$(ALMONDDIR) -lparALMOND -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \ + -L$(OCCA_DIR)/lib $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran #-llapack -lblas @@ -58,8 +59,6 @@ AOBJS = \ ./src/ellipticHaloExchange.o\ ./src/ellipticMultiGridSetup.o \ ./src/ellipticOperator.o \ -./src/ellipticParallelGatherScatter.o \ -./src/ellipticParallelGatherScatterSetup.o \ ./src/ellipticPreconditioner.o\ ./src/ellipticPreconditionerSetup.o\ ./src/ellipticSEMFEMSetup.o\ @@ -94,8 +93,6 @@ LOBJS = \ ../../src/meshOccaSetup3D.o \ ../../src/meshParallelConnectNodes.o \ ../../src/meshParallelConnectOpt.o \ -../../src/meshParallelConsecutiveGlobalNumbering.o\ -../../src/meshParallelGatherScatter.o \ ../../src/meshParallelGatherScatterSetup.o \ ../../src/meshParallelReaderTri2D.o \ ../../src/meshParallelReaderQuad2D.o \ @@ -130,25 +127,20 @@ LOBJS = \ ../../src/occaHostMallocPinned.o \ ../../src/timer.o -COBJS = \ -../../src/gsParallelGatherScatter.o\ -../../src/gsParallelGatherScatterSetup.o\ -../../src/xxtCoarseSolve.o - - -ellipticMain:$(AOBJS) $(LOBJS) ./src/ellipticMain.o gslibInterface - cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/elliptic - cd ../parALMOND; make -j lib; cd ../elliptic +ellipticMain:$(AOBJS) $(LOBJS) ./src/ellipticMain.o libblas libogs libparALMOND $(LD) $(LDFLAGS) -o ellipticMain ./src/ellipticMain.o $(COBJS) $(AOBJS) $(LOBJS) $(paths) $(LIBS) lib:$(AOBJS) ar -cr libelliptic.a $(AOBJS) -gslibInterface: - cd ../../3rdParty/gslib.github; make -j libgs.a; cd ../../solvers/elliptic - $(cc) 
$(CFLAGS) -c -o ../../src/gsParallelGatherScatter.o ../../src/gsParallelGatherScatter.c $(paths) - $(cc) $(CFLAGS) -c -o ../../src/gsParallelGatherScatterSetup.o ../../src/gsParallelGatherScatterSetup.c $(paths) - $(cc) $(CFLAGS) -c -o ../../src/xxtCoarseSolve.o ../../src/xxtCoarseSolve.c $(paths) +libogs: + cd ../../libs/gatherScatter; make -j lib; cd ../../solvers/elliptic + +libblas: + cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/elliptic + +libparALMOND: + cd ../parALMOND; make -j lib; cd ../elliptic all: lib ellipticMain @@ -156,11 +148,12 @@ all: lib ellipticMain clean: cd ../parALMOND; make clean; cd ../elliptic cd ../../src; rm *.o; cd ../solvers/elliptic + cd ../../libs/gatherScatter; make clean; cd ../../solvers/elliptic rm src/*.o ellipticMain libelliptic.a realclean: - cd ../../3rdParty/gslib.github; make clean; cd ../../solvers/elliptic cd ../../3rdParty/BlasLapack; make clean; cd ../../solvers/elliptic + cd ../../libs/gatherScatter; make realclean; cd ../../solvers/elliptic cd ../parALMOND; make clean; cd ../elliptic cd ../../src; rm *.o; cd ../solvers/elliptic rm src/*.o ellipticMain libelliptic.a diff --git a/solvers/elliptic/setups/setupTet3D.rc b/solvers/elliptic/setups/setupTet3D.rc index b0e39ad93..565d451be 100644 --- a/solvers/elliptic/setups/setupTet3D.rc +++ b/solvers/elliptic/setups/setupTet3D.rc @@ -5,7 +5,7 @@ data/ellipticHomogeneous3D.h [MESH FILE] -../../meshes/cavityTetH009.msh +../../meshes/cavityTetH05.msh [MESH DIMENSION] 3 diff --git a/solvers/elliptic/setups/setupTri2D.rc b/solvers/elliptic/setups/setupTri2D.rc index 2c02927d1..8c0c2eaa1 100644 --- a/solvers/elliptic/setups/setupTri2D.rc +++ b/solvers/elliptic/setups/setupTri2D.rc @@ -17,7 +17,7 @@ data/ellipticHomogeneous2D.h 4 [THREAD MODEL] -CUDA +Serial [PLATFORM NUMBER] 0 diff --git a/solvers/elliptic/src/ellipticBuildContinuous.c b/solvers/elliptic/src/ellipticBuildContinuous.c index 4f7f2a9c7..5ab8eb5d2 100644 --- 
a/solvers/elliptic/src/ellipticBuildContinuous.c +++ b/solvers/elliptic/src/ellipticBuildContinuous.c @@ -68,28 +68,38 @@ void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t mesh2D *mesh = elliptic->mesh; setupAide options = elliptic->options; - /* Build a gather-scatter to assemble the global masked problem */ - dlong Ntotal = mesh->Np*mesh->Nelements; + int rank = mesh->rank; - hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); - memcpy(globalNumbering,mesh->globalIds,Ntotal*sizeof(hlong)); - for (dlong n=0;nNmasked;n++) - globalNumbering[elliptic->maskIds[n]] = -1; + //use the masked gs handle to define a global ordering + + // number of degrees of freedom on this rank (after gathering) + hlong Ngather = elliptic->ogs->Ngather; + dlong Ntotal = mesh->Np*mesh->Nelements; + + + // create a global numbering system + hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong)); + int *owner = (int *) calloc(Ngather,sizeof(int)); - // squeeze node numbering - meshParallelConsecutiveGlobalNumbering(mesh, Ntotal, globalNumbering, mesh->globalOwners, globalStarts); + // every gathered degree of freedom has its own global id + MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm); + for(int r=0;rsize;++r) + globalStarts[r+1] = globalStarts[r]+globalStarts[r+1]; - hlong *gatherMaskedBaseIds = (hlong *) calloc(Ntotal,sizeof(hlong)); - for (dlong n=0;ngatherLocalIds[n]; - gatherMaskedBaseIds[n] = globalNumbering[id]; + //use the offsets to set a consecutive global numbering + for (dlong n =0;nogs->Ngather;n++) { + globalIds[n] = n + globalStarts[rank]; + owner[n] = rank; } - //build gather scatter with masked nodes - int verbose = options.compareArgs("VERBOSE", "TRUE") ? 
1:0; - *ogs = meshParallelGatherScatterSetup(mesh, Ntotal, - mesh->gatherLocalIds, gatherMaskedBaseIds, - mesh->gatherBaseRanks, mesh->gatherHaloFlags,verbose); + //scatter this numbering to the original nodes + hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); + int *globalOwners = (int *) calloc(Ntotal,sizeof(int)); + for (dlong n=0;nogs); + ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs); + + free(globalIds); free(owner); // Build non-zeros of stiffness matrix (unassembled) dlong nnzLocal = mesh->Np*mesh->Np*mesh->Nelements; @@ -114,9 +124,6 @@ void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t } } - int *mask = (int *) calloc(mesh->Np*mesh->Nelements,sizeof(int)); - for (dlong n=0;nNmasked;n++) mask[elliptic->maskIds[n]] = 1; - if(mesh->rank==0) printf("Building full FEM matrix...");fflush(stdout); //Build unassembed non-zeros @@ -128,9 +135,9 @@ void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t dfloat J = mesh->ggeo[e*mesh->Nggeo + GWJID]; for (int n=0;nNp;n++) { - if (mask[e*mesh->Np + n]) continue; //skip masked nodes + if (globalNumbering[e*mesh->Np + n]<0) continue; //skip masked nodes for (int m=0;mNp;m++) { - if (mask[e*mesh->Np + m]) continue; //skip masked nodes + if (globalNumbering[e*mesh->Np + m]<0) continue; //skip masked nodes dfloat val = 0.; @@ -145,7 +152,7 @@ void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t sendNonZeros[cnt].val = val; sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + n]; sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + m]; - sendNonZeros[cnt].ownerRank = mesh->globalOwners[e*mesh->Np + n]; + sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + n]; cnt++; } } @@ -217,7 +224,7 @@ void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t MPI_Type_free(&MPI_NONZERO_T); free(sendNonZeros); - free(globalNumbering); + free(globalNumbering); free(globalOwners); free(AsendCounts); 
free(ArecvCounts); @@ -228,7 +235,6 @@ void ellipticBuildContinuousTri2D(elliptic_t *elliptic, dfloat lambda, nonZero_t free(Srs); free(Sss); free(MM ); - free(mask); } @@ -237,29 +243,37 @@ void ellipticBuildContinuousQuad2D(elliptic_t *elliptic, dfloat lambda, nonZero_ mesh2D *mesh = elliptic->mesh; setupAide options = elliptic->options; - /* Build a gather-scatter to assemble the global masked problem */ - dlong Ntotal = mesh->Np*mesh->Nelements; + int rank = mesh->rank; - hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); - memcpy(globalNumbering,mesh->globalIds,Ntotal*sizeof(hlong)); - for (dlong n=0;nNmasked;n++) - globalNumbering[elliptic->maskIds[n]] = -1; + //use the masked gs handle to define a global ordering + + // number of degrees of freedom on this rank (after gathering) + hlong Ngather = elliptic->ogs->Ngather; + dlong Ntotal = mesh->Np*mesh->Nelements; - // squeeze node numbering - meshParallelConsecutiveGlobalNumbering(mesh, Ntotal, globalNumbering, mesh->globalOwners, globalStarts); + // create a global numbering system + hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong)); + int *owner = (int *) calloc(Ngather,sizeof(int)); - hlong *gatherMaskedBaseIds = (hlong *) calloc(Ntotal,sizeof(hlong)); - for (dlong n=0;ngatherLocalIds[n]; - gatherMaskedBaseIds[n] = globalNumbering[id]; + // every gathered degree of freedom has its own global id + MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm); + for(int r=0;rsize;++r) + globalStarts[r+1] = globalStarts[r]+globalStarts[r+1]; + + //use the offsets to set a consecutive global numbering + for (dlong n =0;nogs->Ngather;n++) { + globalIds[n] = n + globalStarts[rank]; + owner[n] = rank; } - //build gather scatter with masked nodes - int verbose = options.compareArgs("VERBOSE", "TRUE") ? 
1:0; - *ogs = meshParallelGatherScatterSetup(mesh, Ntotal, - mesh->gatherLocalIds, gatherMaskedBaseIds, - mesh->gatherBaseRanks, mesh->gatherHaloFlags,verbose); + //scatter this numbering to the original nodes + hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); + int *globalOwners = (int *) calloc(Ntotal,sizeof(int)); + for (dlong n=0;nogs); + ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs); + free(globalIds); free(owner); // 2. Build non-zeros of stiffness matrix (unassembled) dlong nnzLocal = mesh->Np*mesh->Np*mesh->Nelements; @@ -325,7 +339,7 @@ void ellipticBuildContinuousQuad2D(elliptic_t *elliptic, dfloat lambda, nonZero_ sendNonZeros[cnt].val = val; sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + nx+ny*mesh->Nq]; sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + mx+my*mesh->Nq]; - sendNonZeros[cnt].ownerRank = mesh->globalOwners[e*mesh->Np + nx+ny*mesh->Nq]; + sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + nx+ny*mesh->Nq]; cnt++; } } @@ -399,7 +413,7 @@ void ellipticBuildContinuousQuad2D(elliptic_t *elliptic, dfloat lambda, nonZero_ MPI_Type_free(&MPI_NONZERO_T); free(sendNonZeros); - free(globalNumbering); + free(globalNumbering); free(globalOwners); free(AsendCounts); free(ArecvCounts); @@ -412,28 +426,38 @@ void ellipticBuildContinuousTet3D(elliptic_t *elliptic, dfloat lambda, nonZero_t mesh2D *mesh = elliptic->mesh; setupAide options = elliptic->options; - /* Build a gather-scatter to assemble the global masked problem */ - dlong Ntotal = mesh->Np*mesh->Nelements; + int rank = mesh->rank; - hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); - memcpy(globalNumbering,mesh->globalIds,Ntotal*sizeof(hlong)); - for (dlong n=0;nNmasked;n++) - globalNumbering[elliptic->maskIds[n]] = -1; + //use the masked gs handle to define a global ordering + + // number of degrees of freedom on this rank (after gathering) + hlong Ngather = elliptic->ogs->Ngather; + dlong Ntotal = mesh->Np*mesh->Nelements; - // 
squeeze node numbering - meshParallelConsecutiveGlobalNumbering(mesh, Ntotal, globalNumbering, mesh->globalOwners, globalStarts); + // create a global numbering system + hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong)); + int *owner = (int *) calloc(Ngather,sizeof(int)); - hlong *gatherMaskedBaseIds = (hlong *) calloc(Ntotal,sizeof(hlong)); - for (dlong n=0;ngatherLocalIds[n]; - gatherMaskedBaseIds[n] = globalNumbering[id]; + // every gathered degree of freedom has its own global id + MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm); + for(int r=0;rsize;++r) + globalStarts[r+1] = globalStarts[r]+globalStarts[r+1]; + + //use the offsets to set a consecutive global numbering + for (dlong n =0;nogs->Ngather;n++) { + globalIds[n] = n + globalStarts[rank]; + owner[n] = rank; } - //build gather scatter with masked nodes - int verbose = options.compareArgs("VERBOSE", "TRUE") ? 1:0; - *ogs = meshParallelGatherScatterSetup(mesh, Ntotal, - mesh->gatherLocalIds, gatherMaskedBaseIds, - mesh->gatherBaseRanks, mesh->gatherHaloFlags, verbose); + //scatter this numbering to the original nodes + hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); + int *globalOwners = (int *) calloc(Ntotal,sizeof(int)); + for (dlong n=0;nogs); + ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs); + + free(globalIds); free(owner); + // Build non-zeros of stiffness matrix (unassembled) dlong nnzLocal = mesh->Np*mesh->Np*mesh->Nelements; @@ -487,7 +511,7 @@ void ellipticBuildContinuousTet3D(elliptic_t *elliptic, dfloat lambda, nonZero_t sendNonZeros[cnt].val = val; sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + n]; sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + m]; - sendNonZeros[cnt].ownerRank = mesh->globalOwners[e*mesh->Np + n]; + sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + n]; cnt++; } } @@ -560,7 +584,7 @@ void ellipticBuildContinuousTet3D(elliptic_t *elliptic, dfloat lambda, nonZero_t 
MPI_Type_free(&MPI_NONZERO_T); free(sendNonZeros); - free(globalNumbering); + free(globalNumbering); free(globalOwners); free(AsendCounts); free(ArecvCounts); @@ -575,28 +599,37 @@ void ellipticBuildContinuousHex3D(elliptic_t *elliptic, dfloat lambda, nonZero_t mesh2D *mesh = elliptic->mesh; setupAide options = elliptic->options; - /* Build a gather-scatter to assemble the global masked problem */ - dlong Ntotal = mesh->Np*mesh->Nelements; + int rank = mesh->rank; - hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); - memcpy(globalNumbering,mesh->globalIds,Ntotal*sizeof(hlong)); - for (dlong n=0;nNmasked;n++) - globalNumbering[elliptic->maskIds[n]] = -1; + //use the masked gs handle to define a global ordering + + // number of degrees of freedom on this rank (after gathering) + hlong Ngather = elliptic->ogs->Ngather; + dlong Ntotal = mesh->Np*mesh->Nelements; - // squeeze node numbering - meshParallelConsecutiveGlobalNumbering(mesh, Ntotal, globalNumbering, mesh->globalOwners, globalStarts); + // create a global numbering system + hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong)); + int *owner = (int *) calloc(Ngather,sizeof(int)); - hlong *gatherMaskedBaseIds = (hlong *) calloc(Ntotal,sizeof(hlong)); - for (dlong n=0;ngatherLocalIds[n]; - gatherMaskedBaseIds[n] = globalNumbering[id]; + // every gathered degree of freedom has its own global id + MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm); + for(int r=0;rsize;++r) + globalStarts[r+1] = globalStarts[r]+globalStarts[r+1]; + + //use the offsets to set a consecutive global numbering + for (dlong n =0;nogs->Ngather;n++) { + globalIds[n] = n + globalStarts[rank]; + owner[n] = rank; } - //build gather scatter with masked nodes - int verbose = options.compareArgs("VERBOSE", "TRUE") ? 
1:0; - *ogs = meshParallelGatherScatterSetup(mesh, Ntotal, - mesh->gatherLocalIds, gatherMaskedBaseIds, - mesh->gatherBaseRanks, mesh->gatherHaloFlags,verbose); + //scatter this numbering to the original nodes + hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); + int *globalOwners = (int *) calloc(Ntotal,sizeof(int)); + for (dlong n=0;nogs); + ogsScatter(globalOwners, owner, ogsInt, ogsAdd, elliptic->ogs); + + free(globalIds); free(owner); // 2. Build non-zeros of stiffness matrix (unassembled) @@ -698,7 +731,7 @@ void ellipticBuildContinuousHex3D(elliptic_t *elliptic, dfloat lambda, nonZero_t sendNonZeros[cnt].val = val; sendNonZeros[cnt].row = globalNumbering[e*mesh->Np + idn]; sendNonZeros[cnt].col = globalNumbering[e*mesh->Np + idm]; - sendNonZeros[cnt].ownerRank = mesh->globalOwners[e*mesh->Np + idn]; + sendNonZeros[cnt].ownerRank = globalOwners[e*mesh->Np + idn]; cnt++; } } @@ -774,7 +807,7 @@ void ellipticBuildContinuousHex3D(elliptic_t *elliptic, dfloat lambda, nonZero_t MPI_Type_free(&MPI_NONZERO_T); free(sendNonZeros); - free(globalNumbering); + free(globalNumbering); free(globalOwners); free(AsendCounts); free(ArecvCounts); diff --git a/solvers/elliptic/src/ellipticBuildJacobi.c b/solvers/elliptic/src/ellipticBuildJacobi.c index a89a2454f..213c00545 100644 --- a/solvers/elliptic/src/ellipticBuildJacobi.c +++ b/solvers/elliptic/src/ellipticBuildJacobi.c @@ -186,7 +186,7 @@ void ellipticBuildJacobi(elliptic_t* elliptic, dfloat lambda, dfloat **invDiagA) } if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) - gsParallelGatherScatter(mesh->hostGsh, diagA, dfloatString, "add"); + ogsGatherScatter(diagA, ogsDfloat, ogsAdd, elliptic->ogs); *invDiagA = (dfloat*) calloc(diagNnum, sizeof(dfloat)); for (dlong n=0;nNelements*mesh->Np;n++) { diff --git a/solvers/elliptic/src/ellipticBuildMultigridLevel.c b/solvers/elliptic/src/ellipticBuildMultigridLevel.c index 1c2e4814b..5137d5a1c 100644 --- a/solvers/elliptic/src/ellipticBuildMultigridLevel.c 
+++ b/solvers/elliptic/src/ellipticBuildMultigridLevel.c @@ -52,9 +52,7 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf elliptic->dataStream = baseElliptic->dataStream; elliptic->o_EToB = baseElliptic->o_EToB; - elliptic->o_globalGatherElementList = baseElliptic->o_globalGatherElementList; - elliptic->o_localGatherElementList = baseElliptic->o_localGatherElementList; - + elliptic->o_grad = baseElliptic->o_grad; elliptic->o_EXYZ = baseElliptic->o_EXYZ; @@ -141,11 +139,6 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf mesh->dataStream = baseElliptic->mesh->dataStream; mesh->haloExtractKernel = baseElliptic->mesh->haloExtractKernel; - mesh->gatherKernel = baseElliptic->mesh->gatherKernel; - mesh->scatterKernel = baseElliptic->mesh->scatterKernel; - mesh->gatherScatterKernel = baseElliptic->mesh->gatherScatterKernel; - mesh->getKernel = baseElliptic->mesh->getKernel; - mesh->putKernel = baseElliptic->mesh->putKernel; mesh->addScalarKernel = baseElliptic->mesh->addScalarKernel; mesh->maskKernel = baseElliptic->mesh->maskKernel; mesh->sumKernel = baseElliptic->mesh->sumKernel; @@ -354,26 +347,26 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf } mesh->o_Dr = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - mesh->Dr); + mesh->Dr); mesh->o_Ds = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - mesh->Ds); + mesh->Ds); mesh->o_DrT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - DrT); + DrT); mesh->o_DsT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - DsT); + DsT); mesh->o_Dmatrices = mesh->device.malloc(2*mesh->Np*mesh->Np*sizeof(dfloat), DrsT); mesh->o_LIFT = mesh->device.malloc(mesh->Np*mesh->Nfaces*mesh->Nfp*sizeof(dfloat), - mesh->LIFT); + mesh->LIFT); mesh->o_LIFTT = mesh->device.malloc(mesh->Np*mesh->Nfaces*mesh->Nfp*sizeof(dfloat), - LIFTT); + LIFTT); @@ -424,13 +417,13 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t 
*baseElliptic, int Nc, int Nf mesh->o_vgeo = mesh->device.malloc(mesh->Nelements*mesh->Nvgeo*mesh->Np*sizeof(dfloat), - mesh->vgeo); + mesh->vgeo); mesh->o_sgeo = mesh->device.malloc(mesh->Nelements*mesh->Nfaces*mesh->Nfp*mesh->Nsgeo*sizeof(dfloat), - mesh->sgeo); + mesh->sgeo); mesh->o_ggeo = mesh->device.malloc(mesh->Nelements*mesh->Np*mesh->Nggeo*sizeof(dfloat), - mesh->ggeo); + mesh->ggeo); mesh->o_LIFTT = baseElliptic->mesh->o_LIFTT; //dummy buffer @@ -524,32 +517,32 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf } mesh->o_Dr = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - mesh->Dr); + mesh->Dr); mesh->o_Ds = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - mesh->Ds); + mesh->Ds); mesh->o_Dt = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - mesh->Dt); + mesh->Dt); mesh->o_DrT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - DrT); + DrT); mesh->o_DsT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - DsT); + DsT); mesh->o_DtT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - DtT); + DtT); mesh->o_Dmatrices = mesh->device.malloc(3*mesh->Np*mesh->Np*sizeof(dfloat), DrstT); mesh->o_LIFT = mesh->device.malloc(mesh->Np*mesh->Nfaces*mesh->Nfp*sizeof(dfloat), - mesh->LIFT); + mesh->LIFT); mesh->o_LIFTT = mesh->device.malloc(mesh->Np*mesh->Nfaces*mesh->Nfp*sizeof(dfloat), - LIFTT); + LIFTT); mesh->o_SrrT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), SrrT); mesh->o_SrsT = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), SrsT); @@ -587,21 +580,21 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf mesh->o_vgeo = mesh->device.malloc(mesh->Nelements*mesh->Nvgeo*mesh->Np*sizeof(dfloat), - mesh->vgeo); + mesh->vgeo); mesh->o_sgeo = mesh->device.malloc(mesh->Nelements*mesh->Nfaces*mesh->Nfp*mesh->Nsgeo*sizeof(dfloat), - mesh->sgeo); + mesh->sgeo); mesh->o_ggeo = mesh->device.malloc(mesh->Nelements*mesh->Np*mesh->Nggeo*sizeof(dfloat), - 
mesh->ggeo); + mesh->ggeo); mesh->o_vmapM = mesh->device.malloc(mesh->Nelements*mesh->Nfp*mesh->Nfaces*sizeof(dlong), - mesh->vmapM); + mesh->vmapM); mesh->o_vmapP = mesh->device.malloc(mesh->Nelements*mesh->Nfp*mesh->Nfaces*sizeof(dlong), - mesh->vmapP); + mesh->vmapP); mesh->LIFT = baseElliptic->mesh->LIFT; //dummy buffer mesh->o_LIFTT = baseElliptic->mesh->o_LIFTT; //dummy buffer @@ -618,10 +611,10 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf mesh->vgeo = (dfloat*) realloc(mesh->vgeo, (Nlocal+Nhalo)*mesh->Nvgeo*sizeof(dfloat)); meshHaloExchange(mesh, - mesh->Nvgeo*mesh->Np*sizeof(dfloat), - mesh->vgeo, - vgeoSendBuffer, - mesh->vgeo + Nlocal*mesh->Nvgeo); + mesh->Nvgeo*mesh->Np*sizeof(dfloat), + mesh->vgeo, + vgeoSendBuffer, + mesh->vgeo + Nlocal*mesh->Nvgeo); mesh->o_vgeo = mesh->device.malloc((Nlocal+Nhalo)*mesh->Nvgeo*sizeof(dfloat), mesh->vgeo); @@ -630,15 +623,15 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf mesh->o_MM = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), - mesh->MM); + mesh->MM); mesh->o_vmapM = mesh->device.malloc(mesh->Nelements*mesh->Nfp*mesh->Nfaces*sizeof(int), - mesh->vmapM); + mesh->vmapM); mesh->o_vmapP = mesh->device.malloc(mesh->Nelements*mesh->Nfp*mesh->Nfaces*sizeof(int), - mesh->vmapP); + mesh->vmapP); //set the normalization constant for the allNeumann Poisson problem on this coarse mesh @@ -651,6 +644,58 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf elliptic->o_tmp = mesh->device.malloc(Nblock*sizeof(dfloat), elliptic->tmp); elliptic->o_tmp2 = mesh->device.malloc(Nblock2*sizeof(dfloat), elliptic->tmp); + + //setup an unmasked gs handle + int verbose = options.compareArgs("VERBOSE","TRUE") ? 
1:0; + meshParallelGatherScatterSetup(mesh, Ntotal, mesh->globalIds, mesh->comm, verbose); + + //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) + elliptic->mapB = (int *) calloc(mesh->Nelements*mesh->Np,sizeof(int)); + for (dlong e=0;eNelements;e++) { + for (int n=0;nNp;n++) elliptic->mapB[n+e*mesh->Np] = 1E9; + for (int f=0;fNfaces;f++) { + int bc = mesh->EToB[f+e*mesh->Nfaces]; + if (bc>0) { + for (int n=0;nNfp;n++) { + int BCFlag = elliptic->BCType[bc]; + int fid = mesh->faceNodes[n+f*mesh->Nfp]; + elliptic->mapB[fid+e*mesh->Np] = mymin(BCFlag,elliptic->mapB[fid+e*mesh->Np]); + } + } + } + } + ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs); + + //use the bc flags to find masked ids + elliptic->Nmasked = 0; + for (dlong n=0;nNelements*mesh->Np;n++) { + if (elliptic->mapB[n] == 1E9) { + elliptic->mapB[n] = 0.; + } else if (elliptic->mapB[n] == 1) { //Dirichlet boundary + elliptic->Nmasked++; + } + } + elliptic->o_mapB = mesh->device.malloc(mesh->Nelements*mesh->Np*sizeof(int), elliptic->mapB); + + elliptic->maskIds = (dlong *) calloc(elliptic->Nmasked, sizeof(dlong)); + elliptic->Nmasked =0; //reset + for (dlong n=0;nNelements*mesh->Np;n++) { + if (elliptic->mapB[n] == 1) elliptic->maskIds[elliptic->Nmasked++] = n; + } + if (elliptic->Nmasked) elliptic->o_maskIds = mesh->device.malloc(elliptic->Nmasked*sizeof(dlong), elliptic->maskIds); + + //make a masked version of the global id numbering + mesh->maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong)); + memcpy(mesh->maskedGlobalIds, mesh->globalIds, Ntotal*sizeof(hlong)); + for (dlong n=0;nNmasked;n++) + mesh->maskedGlobalIds[elliptic->maskIds[n]] = 0; + + //use the masked ids to make another gs handle + elliptic->ogs = ogsSetup(Ntotal, mesh->maskedGlobalIds, mesh->comm, verbose, mesh->device); + elliptic->o_invDegree = elliptic->ogs->o_invDegree; + + + // info for kernel construction occa::properties kernelInfo; kernelInfo["defines"].asObject(); @@ -806,14 
+851,14 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf // check for trilinear if(elliptic->elementType!=HEXAHEDRA){ - sprintf(kernelName, "ellipticPartialAx%s", suffix); + sprintf(kernelName, "ellipticPartialAx%s", suffix); } else{ - if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")){ - sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix); - }else{ - sprintf(kernelName, "ellipticPartialAx%s", suffix); - } + if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")){ + sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix); + }else{ + sprintf(kernelName, "ellipticPartialAx%s", suffix); + } } //sprintf(kernelName, "ellipticPartialAx%s", suffix); @@ -885,21 +930,21 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf int NpFine, NpCoarse; switch(elliptic->elementType){ case TRIANGLES: - NpFine = (Nf+1)*(Nf+2)/2; - NpCoarse = (Nc+1)*(Nc+2)/2; - break; + NpFine = (Nf+1)*(Nf+2)/2; + NpCoarse = (Nc+1)*(Nc+2)/2; + break; case QUADRILATERALS: - NpFine = (Nf+1)*(Nf+1); - NpCoarse = (Nc+1)*(Nc+1); - break; + NpFine = (Nf+1)*(Nf+1); + NpCoarse = (Nc+1)*(Nc+1); + break; case TETRAHEDRA: - NpFine = (Nf+1)*(Nf+2)*(Nf+3)/6; - NpCoarse = (Nc+1)*(Nc+2)*(Nc+3)/6; - break; + NpFine = (Nf+1)*(Nf+2)*(Nf+3)/6; + NpCoarse = (Nc+1)*(Nc+2)*(Nc+3)/6; + break; case HEXAHEDRA: - NpFine = (Nf+1)*(Nf+1)*(Nf+1); - NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); - break; + NpFine = (Nf+1)*(Nf+1)*(Nf+1); + NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); + break; } kernelInfo["defines/" "p_NpFine"]= NpFine; kernelInfo["defines/" "p_NpCoarse"]= NpCoarse; @@ -920,64 +965,21 @@ elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf MPI_Barrier(mesh->comm); } - //on host gather-scatter - int verbose = options.compareArgs("VERBOSE", "TRUE") ? 
1:0; - mesh->hostGsh = gsParallelGatherScatterSetup(mesh->comm, mesh->Nelements*mesh->Np, mesh->globalIds, verbose); - - // set up separate gather scatter infrastructure for halo and non halo nodes - ellipticParallelGatherScatterSetup(elliptic); - - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - elliptic->mapB = (int *) calloc(mesh->Nelements*mesh->Np,sizeof(int)); - for (dlong e=0;eNelements;e++) { - for (int n=0;nNp;n++) elliptic->mapB[n+e*mesh->Np] = 1E9; - for (int f=0;fNfaces;f++) { - int bc = mesh->EToB[f+e*mesh->Nfaces]; - if (bc>0) { - for (int n=0;nNfp;n++) { - int BCFlag = elliptic->BCType[bc]; - int fid = mesh->faceNodes[n+f*mesh->Nfp]; - elliptic->mapB[fid+e*mesh->Np] = mymin(BCFlag,elliptic->mapB[fid+e*mesh->Np]); - } - } - } - } - gsParallelGatherScatter(mesh->hostGsh, elliptic->mapB, "int", "min"); - - //use the bc flags to find masked ids - elliptic->Nmasked = 0; - for (dlong n=0;nNelements*mesh->Np;n++) { - if (elliptic->mapB[n] == 1E9) { - elliptic->mapB[n] = 0.; - } else if (elliptic->mapB[n] == 1) { //Dirichlet boundary - elliptic->Nmasked++; - } - } - elliptic->o_mapB = mesh->device.malloc(mesh->Nelements*mesh->Np*sizeof(int), elliptic->mapB); - - elliptic->maskIds = (dlong *) calloc(elliptic->Nmasked, sizeof(dlong)); - elliptic->Nmasked =0; //reset - for (dlong n=0;nNelements*mesh->Np;n++) { - if (elliptic->mapB[n] == 1) elliptic->maskIds[elliptic->Nmasked++] = n; - } - if (elliptic->Nmasked) elliptic->o_maskIds = mesh->device.malloc(elliptic->Nmasked*sizeof(dlong), elliptic->maskIds); - - if(elliptic->elementType==HEXAHEDRA){ if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){ if(options.compareArgs("ELEMENT MAP", "TRILINEAR")){ - - // pack gllz, gllw, and elementwise EXYZ - dfloat *gllzw = (dfloat*) calloc(2*mesh->Nq, sizeof(dfloat)); - - int sk = 0; - for(int n=0;nNq;++n) - gllzw[sk++] = mesh->gllz[n]; - for(int n=0;nNq;++n) - gllzw[sk++] = mesh->gllw[n]; - - elliptic->o_gllzw = 
mesh->device.malloc(2*mesh->Nq*sizeof(dfloat), gllzw); - free(gllzw); + + // pack gllz, gllw, and elementwise EXYZ + dfloat *gllzw = (dfloat*) calloc(2*mesh->Nq, sizeof(dfloat)); + + int sk = 0; + for(int n=0;nNq;++n) + gllzw[sk++] = mesh->gllz[n]; + for(int n=0;nNq;++n) + gllzw[sk++] = mesh->gllw[n]; + + elliptic->o_gllzw = mesh->device.malloc(2*mesh->Nq*sizeof(dfloat), gllzw); + free(gllzw); } } } diff --git a/solvers/elliptic/src/ellipticMain.c b/solvers/elliptic/src/ellipticMain.c index 313f1f6a1..64d0f5be0 100644 --- a/solvers/elliptic/src/ellipticMain.c +++ b/solvers/elliptic/src/ellipticMain.c @@ -72,10 +72,10 @@ int main(int argc, char **argv){ // set up occa::properties kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); elliptic_t *elliptic = ellipticSetup(mesh, lambda, kernelInfo, options); @@ -90,21 +90,21 @@ int main(int argc, char **argv){ for(int it=0;ito_x, elliptic->o_Ax, dfloatString); // standard precision + ellipticOperator(elliptic, lambda, elliptic->o_x, elliptic->o_Ax, dfloatString); // standard precision if(options.compareArgs("BENCHMARK", "BK5")){ - if(!options.compareArgs("ELEMENT MAP", "TRILINEAR")){ - elliptic->partialAxKernel(elliptic->NlocalGatherElements, - elliptic->o_localGatherElementList, - mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, - lambda, elliptic->o_x, elliptic->o_Ax); - } - else{ - elliptic->partialAxKernel(elliptic->NlocalGatherElements, - elliptic->o_localGatherElementList, - elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, - lambda, elliptic->o_x, elliptic->o_Ax); - } + if(!options.compareArgs("ELEMENT MAP", "TRILINEAR")){ + elliptic->partialAxKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_ggeo, 
mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, + lambda, elliptic->o_x, elliptic->o_Ax); + } + else{ + elliptic->partialAxKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, + lambda, elliptic->o_x, elliptic->o_Ax); + } } } @@ -117,13 +117,13 @@ int main(int argc, char **argv){ printf("%d, %d, %g, %d, %g, %g; \%\%elemental: N, dofs, elapsed, dummy, time per node, nodes/time %s\n", - mesh->N, - elliptic->NlocalGatherElements*mesh->Np, - 0, - elapsedAx, - elapsedAx/(mesh->Np*mesh->Nelements), - mesh->Nelements*mesh->Np/elapsedAx, - options.getArgs("DISCRETIZATION").c_str()); + mesh->N, + mesh->NlocalGatherElements*mesh->Np, + 0, + elapsedAx, + elapsedAx/(mesh->Np*mesh->Nelements), + mesh->Nelements*mesh->Np/elapsedAx, + options.getArgs("DISCRETIZATION").c_str()); } else{ @@ -141,23 +141,23 @@ int main(int argc, char **argv){ double elapsed = mesh->device.timeBetween(startTag, stopTag); printf("%d, %d, %g, %d, %g, %g; \%\%global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n", - mesh->N, - mesh->Nelements*mesh->Np, - elapsed, - it, - elapsed/(mesh->Np*mesh->Nelements), - mesh->Nelements*(it*mesh->Np/elapsed), - options.getArgs("PRECONDITIONER").c_str()); + mesh->N, + mesh->Nelements*mesh->Np, + elapsed, + it, + elapsed/(mesh->Np*mesh->Nelements), + mesh->Nelements*(it*mesh->Np/elapsed), + options.getArgs("PRECONDITIONER").c_str()); if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){ dfloat zero = 0.; elliptic->addBCKernel(mesh->Nelements, - zero, - mesh->o_x, - mesh->o_y, - mesh->o_z, - elliptic->o_mapB, - elliptic->o_x); + zero, + mesh->o_x, + mesh->o_y, + mesh->o_z, + elliptic->o_mapB, + elliptic->o_x); } // copy solution from DEVICE to HOST @@ -169,19 +169,19 @@ int main(int argc, char **argv){ dfloat maxError = 0; for(dlong e=0;eNelements;++e){ for(int n=0;nNp;++n){ - dlong id = e*mesh->Np+n; - dfloat xn = mesh->x[id]; - dfloat 
yn = mesh->y[id]; - dfloat zn = mesh->z[id]; + dlong id = e*mesh->Np+n; + dfloat xn = mesh->x[id]; + dfloat yn = mesh->y[id]; + dfloat zn = mesh->z[id]; - dfloat exact; - if (elliptic->dim==2) - exact = sin(M_PI*xn)*sin(M_PI*yn); - else - exact = cos(M_PI*xn)*cos(M_PI*yn)*cos(M_PI*zn); - dfloat error = fabs(exact-mesh->q[id]); - - maxError = mymax(maxError, error); + dfloat exact; + if (elliptic->dim==2) + exact = sin(M_PI*xn)*sin(M_PI*yn); + else + exact = cos(M_PI*xn)*cos(M_PI*yn)*cos(M_PI*zn); + dfloat error = fabs(exact-mesh->q[id]); + + maxError = mymax(maxError, error); } } diff --git a/solvers/elliptic/src/ellipticMultiGridSetup.c b/solvers/elliptic/src/ellipticMultiGridSetup.c index f8fcd5508..72db6933c 100644 --- a/solvers/elliptic/src/ellipticMultiGridSetup.c +++ b/solvers/elliptic/src/ellipticMultiGridSetup.c @@ -51,7 +51,7 @@ void ellipticMultigridCoarsen(void **args, occa::memory &o_x, occa::memory &o_Rx precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx); if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - ellipticParallelGatherScatter(mesh, mesh->ogs, o_Rx, dfloatString, "add"); + ogsGatherScatter(o_Rx, ogsDfloat, ogsAdd, mesh->ogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx); } } @@ -75,7 +75,7 @@ void ellipticGather(void **args, occa::memory &o_x, occa::memory &o_Gx) { mesh_t *mesh = elliptic->mesh; setupAide options = elliptic->options; - meshParallelGather(mesh, ogs, o_x, o_Gx); + ogsGather(o_Gx, o_x, ogsDfloat, ogsAdd, ogs); elliptic->dotMultiplyKernel(ogs->Ngather, ogs->o_gatherInvDegree, o_Gx, o_Gx); } @@ -88,7 +88,7 @@ void ellipticScatter(void **args, occa::memory &o_x, occa::memory &o_Sx) { mesh_t *mesh = elliptic->mesh; setupAide options = elliptic->options; - meshParallelScatter(mesh, ogs, o_x, o_Sx); + ogsScatter(o_Sx, o_x, ogsDfloat, ogsAdd, ogs); } void buildCoarsenerTriTet(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc); @@ -337,7 +337,7 @@ void 
ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd coarseLevel->gatherArgs = (void **) calloc(3,sizeof(void*)); coarseLevel->gatherArgs[0] = (void *) ellipticL; - coarseLevel->gatherArgs[1] = (void *) coarseogs; + coarseLevel->gatherArgs[1] = (void *) ellipticL->ogs; coarseLevel->gatherArgs[2] = (void *) &(coarseLevel->o_Sx); coarseLevel->scatterArgs = coarseLevel->gatherArgs; diff --git a/solvers/elliptic/src/ellipticOperator.c b/solvers/elliptic/src/ellipticOperator.c index 0f1c2a2c2..d119e581b 100644 --- a/solvers/elliptic/src/ellipticOperator.c +++ b/solvers/elliptic/src/ellipticOperator.c @@ -42,69 +42,38 @@ void ellipticOperator(elliptic_t *elliptic, dfloat lambda, occa::memory &o_q, oc dlong Nblock = elliptic->Nblock; dfloat *tmp = elliptic->tmp; occa::memory &o_tmp = elliptic->o_tmp; - - int one = 1; - dlong dOne = 1; + if(options.compareArgs("DISCRETIZATION", "CONTINUOUS")){ - ogs_t *ogs = elliptic->mesh->ogs; + ogs_t *ogs = elliptic->ogs; int mapType = (elliptic->elementType==HEXAHEDRA && - options.compareArgs("ELEMENT MAP", "TRILINEAR")) ? 1:0; + options.compareArgs("ELEMENT MAP", "TRILINEAR")) ? 1:0; occa::kernel &partialAxKernel = (strstr(precision, "float")) ? 
elliptic->partialFloatAxKernel : elliptic->partialAxKernel; - if(elliptic->NglobalGatherElements) { + if(mesh->NglobalGatherElements) { if(mapType==0) - partialAxKernel(elliptic->NglobalGatherElements, elliptic->o_globalGatherElementList, - mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); + partialAxKernel(mesh->NglobalGatherElements, mesh->o_globalGatherElementList, + mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); else - partialAxKernel(elliptic->NglobalGatherElements, elliptic->o_globalGatherElementList, - elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); + partialAxKernel(mesh->NglobalGatherElements, mesh->o_globalGatherElementList, + elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); } - if(ogs->NhaloGather) { - mesh->gatherKernel(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherLocalIds, one, dOne, o_Aq, ogs->o_haloGatherTmp); - - mesh->device.finish(); - mesh->device.setStream(elliptic->dataStream); - - ogs->o_haloGatherTmp.copyTo(ogs->haloGatherTmp,"async: true"); - mesh->device.setStream(elliptic->defaultStream); - } + ogsGatherScatterStart(o_Aq, ogsDfloat, ogsAdd, ogs); - if(elliptic->NlocalGatherElements){ + if(mesh->NlocalGatherElements){ if(mapType==0) - partialAxKernel(elliptic->NlocalGatherElements, elliptic->o_localGatherElementList, - mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); + partialAxKernel(mesh->NlocalGatherElements, mesh->o_localGatherElementList, + mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); else - partialAxKernel(elliptic->NlocalGatherElements, elliptic->o_localGatherElementList, - elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); + partialAxKernel(mesh->NlocalGatherElements, mesh->o_localGatherElementList, + 
elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, o_q, o_Aq); } // finalize gather using local and global contributions - if(ogs->NnonHaloGather) - mesh->gatherScatterKernel(ogs->NnonHaloGather, ogs->o_nonHaloGatherOffsets, ogs->o_nonHaloGatherLocalIds, one, dOne, o_Aq); - - // C0 halo gather-scatter (on data stream) - if(ogs->NhaloGather) { - mesh->device.setStream(elliptic->dataStream); - mesh->device.finish(); - - // MPI based gather scatter using libgs - gsParallelGatherScatter(ogs->haloGsh, ogs->haloGatherTmp, dfloatString, "add"); - - // copy totally gather halo data back from HOST to DEVICE - ogs->o_haloGatherTmp.copyFrom(ogs->haloGatherTmp,"async: true"); - - mesh->device.finish(); - - mesh->device.setStream(elliptic->defaultStream); - - // do scatter back to local nodes - mesh->scatterKernel(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherLocalIds, one, dOne, ogs->o_haloGatherTmp, o_Aq); - - } + ogsGatherScatterFinish(o_Aq, ogsDfloat, ogsAdd, ogs); if(elliptic->allNeumann) { // mesh->sumKernel(mesh->Nelements*mesh->Np, o_q, o_tmp); diff --git a/solvers/elliptic/src/ellipticParallelGatherScatterSetup.c b/solvers/elliptic/src/ellipticParallelGatherScatterSetup.c deleted file mode 100644 index 787d37ca9..000000000 --- a/solvers/elliptic/src/ellipticParallelGatherScatterSetup.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice 
shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "elliptic.h" - -// assume nodes locally sorted by rank then global index -// assume gather and scatter are the same sets -void ellipticParallelGatherScatterSetup(elliptic_t* elliptic){ - - mesh_t *mesh = elliptic->mesh; - - // setup occa gather scatter - int verbose = elliptic->options.compareArgs("VERBOSE","TRUE") ? 1:0; - mesh->ogs = meshParallelGatherScatterSetup(mesh,mesh->Np*mesh->Nelements, - mesh->gatherLocalIds, - mesh->gatherBaseIds, - mesh->gatherBaseRanks, - mesh->gatherHaloFlags, - verbose); - elliptic->o_invDegree = mesh->ogs->o_invDegree; - - // count elements that contribute to global C0 gather-scatter - dlong globalCount = 0; - dlong localCount = 0; - for(dlong e=0;eNelements;++e){ - int isHalo = 0; - for(int n=0;nNp;++n){ - if(mesh->globalHaloFlags[e*mesh->Np+n]>0){ - isHalo = 1; - break; - } - } - globalCount += isHalo; - localCount += 1-isHalo; - } - - dlong *globalGatherElementList = (dlong*) calloc(globalCount, sizeof(dlong)); - dlong *localGatherElementList = (dlong*) calloc(localCount, sizeof(dlong)); - - globalCount = 0; - localCount = 0; - - for(dlong e=0;eNelements;++e){ - int isHalo = 0; - for(int n=0;nNp;++n){ - if(mesh->globalHaloFlags[e*mesh->Np+n]>0){ - isHalo = 1; - break; - } - } - if(isHalo){ - globalGatherElementList[globalCount++] = e; - } else{ - localGatherElementList[localCount++] = e; - } - } - //printf("local = %d, global = %d\n", localCount, globalCount); - - 
elliptic->NglobalGatherElements = globalCount; - elliptic->NlocalGatherElements = localCount; - - if(globalCount) - elliptic->o_globalGatherElementList = - mesh->device.malloc(globalCount*sizeof(dlong), globalGatherElementList); - - if(localCount) - elliptic->o_localGatherElementList = - mesh->device.malloc(localCount*sizeof(dlong), localGatherElementList); -} diff --git a/solvers/elliptic/src/ellipticPreconditioner.c b/solvers/elliptic/src/ellipticPreconditioner.c index 55fa25ec3..c824aa7ec 100644 --- a/solvers/elliptic/src/ellipticPreconditioner.c +++ b/solvers/elliptic/src/ellipticPreconditioner.c @@ -49,55 +49,24 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, precon->blockJacobiKernel(mesh->Nelements, invLambda, mesh->o_vgeo, precon->o_invMM, o_r, o_z); occaTimerToc(mesh->device,"blockJacobiKernel"); } else if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { - ogs_t *ogs = elliptic->mesh->ogs; - int one = 1; - dlong dOne = 1; + ogs_t *ogs = elliptic->ogs; elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, ogs->o_invDegree, o_r, elliptic->o_rtmp); - //pre-mask - //if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_r); - - if(ogs->NhaloGather) { - precon->partialblockJacobiKernel(elliptic->NglobalGatherElements, - elliptic->o_globalGatherElementList, - invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z); - mesh->device.finish(); - mesh->device.setStream(elliptic->dataStream); - mesh->gatherKernel(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherLocalIds, one, dOne, o_z, ogs->o_haloGatherTmp); - ogs->o_haloGatherTmp.copyTo(ogs->haloGatherTmp,"async: true"); - mesh->device.setStream(elliptic->defaultStream); - } - if(elliptic->NlocalGatherElements){ - precon->partialblockJacobiKernel(elliptic->NlocalGatherElements, - elliptic->o_localGatherElementList, + if(mesh->NglobalGatherElements) + precon->partialblockJacobiKernel(mesh->NglobalGatherElements, + 
mesh->o_globalGatherElementList, invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z); - } - - // finalize gather using local and global contributions - if(ogs->NnonHaloGather) mesh->gatherScatterKernel(ogs->NnonHaloGather, ogs->o_nonHaloGatherOffsets, ogs->o_nonHaloGatherLocalIds, one, dOne, o_z); - - // C0 halo gather-scatter (on data stream) - if(ogs->NhaloGather) { - mesh->device.setStream(elliptic->dataStream); - mesh->device.finish(); - // MPI based gather scatter using libgs - gsParallelGatherScatter(ogs->haloGsh, ogs->haloGatherTmp, dfloatString, "add"); - - // copy totally gather halo data back from HOST to DEVICE - ogs->o_haloGatherTmp.copyFrom(ogs->haloGatherTmp,"async: true"); - - // do scatter back to local nodes - mesh->scatterKernel(ogs->NhaloGather, ogs->o_haloGatherOffsets, ogs->o_haloGatherLocalIds, one, dOne, ogs->o_haloGatherTmp, o_z); - mesh->device.setStream(elliptic->defaultStream); - } - - mesh->device.finish(); - mesh->device.setStream(elliptic->dataStream); - mesh->device.finish(); - mesh->device.setStream(elliptic->defaultStream); + ogsGatherScatterStart(o_z, ogsDfloat, ogsAdd, ogs); + if(mesh->NlocalGatherElements) + precon->partialblockJacobiKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z); + + ogsGatherScatterFinish(o_z, ogsDfloat, ogsAdd, ogs); + elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, ogs->o_invDegree, o_z, o_z); //post-mask @@ -110,15 +79,15 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, o_z.copyFrom(o_r); elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, elliptic->o_invDegree, o_z, o_z); precon->SEMFEMInterpKernel(mesh->Nelements,mesh->o_SEMFEMAnterp,o_z,precon->o_rFEM); - meshParallelGather(mesh, precon->FEMogs, precon->o_rFEM, precon->o_GrFEM); + ogsGather(precon->o_GrFEM, precon->o_rFEM, ogsDfloat, ogsAdd, precon->FEMogs); occaTimerTic(mesh->device,"parALMOND"); 
parAlmondPrecon(precon->parAlmond, precon->o_GzFEM, precon->o_GrFEM); occaTimerToc(mesh->device,"parALMOND"); - meshParallelScatter(mesh, precon->FEMogs, precon->o_GzFEM, precon->o_zFEM); + ogsScatter(precon->o_zFEM, precon->o_GzFEM, ogsDfloat, ogsAdd, precon->FEMogs); precon->SEMFEMAnterpKernel(mesh->Nelements,mesh->o_SEMFEMAnterp,precon->o_zFEM,o_z); elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, elliptic->o_invDegree, o_z, o_z); - ellipticParallelGatherScatter(mesh, mesh->ogs, o_z, dfloatString, "add"); + ogsGatherScatter(o_z, ogsDfloat, ogsAdd, elliptic->ogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_z); } else { occaTimerTic(mesh->device,"parALMOND"); diff --git a/solvers/elliptic/src/ellipticSEMFEMSetup.c b/solvers/elliptic/src/ellipticSEMFEMSetup.c index a7973570c..59a8cd9a7 100644 --- a/solvers/elliptic/src/ellipticSEMFEMSetup.c +++ b/solvers/elliptic/src/ellipticSEMFEMSetup.c @@ -80,10 +80,10 @@ int parallelCompareFEMvertsLocalId(const void *a, const void *b){ int parallelCompareRowColumn(const void *a, const void *b); -void BuildFEMMatrixTri2D (mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A); -void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A); -void BuildFEMMatrixTet3D (mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A); -void BuildFEMMatrixHex3D (mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A); +void BuildFEMMatrixTri2D (mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,int *globalOwners,dlong *cnt, nonZero_t *A); +void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,int *globalOwners,dlong *cnt, nonZero_t *A); +void BuildFEMMatrixTet3D 
(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,int *globalOwners,dlong *cnt, nonZero_t *A); +void BuildFEMMatrixHex3D (mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,int *globalOwners,dlong *cnt, nonZero_t *A); @@ -154,7 +154,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) // global nodes meshParallelConnectNodes(pmesh); - //pmesh->globalIds and pmesh->globalOwners are now populated + //pmesh->globalIds is now populated } else if (elliptic->elementType==TETRAHEDRA) { @@ -208,7 +208,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) // global nodes meshParallelConnectNodes(pmesh); - //pmesh->globalIds and pmesh->globalOwners are now populated + //pmesh->globalIds is now populated } @@ -378,15 +378,13 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) dlong Ntotal = pmesh->Np*pmesh->Nelements; int verbose = options.compareArgs("VERBOSE","TRUE") ? 
1:0; - hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); - hlong *globalStarts = (hlong *) calloc(mesh->size+1,sizeof(hlong)); - memcpy(globalNumbering,pmesh->globalIds,Ntotal*sizeof(hlong)); - + pmesh->maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong)); + memcpy(pmesh->maskedGlobalIds, pmesh->globalIds, Ntotal*sizeof(hlong)); if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA) { //build a new mask for NpFEM>Np node sets - //on-host version of gather-scatter - pmesh->hostGsh = gsParallelGatherScatterSetup(mesh->comm, Ntotal, globalNumbering,verbose); + // gather-scatter + pmesh->ogs = ogsSetup(Ntotal, pmesh->globalIds, mesh->comm, verbose, mesh->device); //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) int *mapB = (int *) calloc(Ntotal,sizeof(int)); @@ -403,12 +401,12 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) } } } - gsParallelGatherScatter(pmesh->hostGsh, mapB, "int", "min"); + ogsGatherScatter(mapB, ogsInt, ogsMin, pmesh->ogs); //use the bc flags to find masked ids for (dlong n=0;nNelements*pmesh->Np;n++) { if (mapB[n] == 1) { //Dirichlet boundary - globalNumbering[n] = -1; + pmesh->maskedGlobalIds[n] = 0; } } free(mapB); @@ -416,23 +414,41 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) //mask using the original mask for (dlong n=0;nNmasked;n++) - globalNumbering[elliptic->maskIds[n]] = -1; + pmesh->maskedGlobalIds[elliptic->maskIds[n]] = 0; } - // squeeze node numbering - meshParallelConsecutiveGlobalNumbering(pmesh, Ntotal, globalNumbering, pmesh->globalOwners, globalStarts); + //build masked gs handle + precon->FEMogs = ogsSetup(Ntotal, pmesh->maskedGlobalIds, mesh->comm, verbose, mesh->device); + + // number of degrees of freedom on this rank (after gathering) + hlong Ngather = precon->FEMogs->Ngather; + + // create a global numbering system + hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong)); 
+ int *owner = (int *) calloc(Ngather,sizeof(int)); - hlong *gatherMaskedBaseIds = (hlong *) calloc(Ntotal,sizeof(hlong)); - for (dlong n=0;ngatherLocalIds[n]; - gatherMaskedBaseIds[n] = globalNumbering[id]; + // every gathered degree of freedom has its own global id + hlong *globalStarts = (hlong *) calloc(mesh->size+1,sizeof(hlong)); + MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh->comm); + for(int r=0;rsize;++r) + globalStarts[r+1] = globalStarts[r]+globalStarts[r+1]; + + //use the offsets to set a consecutive global numbering + for (dlong n =0;nFEMogs->Ngather;n++) { + globalIds[n] = n + globalStarts[mesh->rank]; + owner[n] = mesh->rank; } - //build gather scatter with masked nodes - precon->FEMogs = meshParallelGatherScatterSetup(pmesh, Ntotal, - pmesh->gatherLocalIds, gatherMaskedBaseIds, - pmesh->gatherBaseRanks, pmesh->gatherHaloFlags,verbose); + //scatter this numbering to the original nodes + hlong *globalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); + int *globalOwners = (int *) calloc(Ntotal,sizeof(int)); + for (dlong n=0;nFEMogs); + ogsScatter(globalOwners, owner, ogsInt, ogsAdd, precon->FEMogs); + + + free(globalIds); free(owner); if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA) { //dont need these anymore @@ -505,13 +521,13 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) //Build unassembed non-zeros switch(elliptic->elementType){ case TRIANGLES: - BuildFEMMatrixTri2D(femMesh,pmesh,lambda, localIds, globalNumbering,&cnt,sendNonZeros); break; + BuildFEMMatrixTri2D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break; case QUADRILATERALS: - BuildFEMMatrixQuad2D(femMesh,pmesh,lambda, localIds, globalNumbering,&cnt,sendNonZeros); break; + BuildFEMMatrixQuad2D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break; case TETRAHEDRA: - BuildFEMMatrixTet3D(femMesh,pmesh,lambda, localIds, 
globalNumbering,&cnt,sendNonZeros); break; + BuildFEMMatrixTet3D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break; case HEXAHEDRA: - BuildFEMMatrixHex3D(femMesh,pmesh,lambda, localIds, globalNumbering,&cnt,sendNonZeros); break; + BuildFEMMatrixHex3D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break; } // Make the MPI_NONZERO_T data type @@ -646,7 +662,9 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) } -void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A) { +void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, + dlong *localIds, hlong* globalNumbering, int *globalOwners, + dlong *cnt, nonZero_t *A) { #pragma omp parallel for for (dlong e=0;eNelements;e++) { @@ -678,7 +696,7 @@ void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *l A[*cnt].val = val; A[*cnt].row = globalNumbering[idn]; A[*cnt].col = globalNumbering[idm]; - A[*cnt].ownerRank = pmesh->globalOwners[idn]; + A[*cnt].ownerRank = globalOwners[idn]; (*cnt)++; } } @@ -687,7 +705,9 @@ void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *l } } -void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A) { +void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, + dlong *localIds, hlong* globalNumbering, int *globalOwners, + dlong *cnt, nonZero_t *A) { #pragma omp parallel for for (dlong e=0;eNelements;e++) { @@ -746,7 +766,7 @@ void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong * A[*cnt].val = val; A[*cnt].row = globalNumbering[idn]; A[*cnt].col = globalNumbering[idm]; - A[*cnt].ownerRank = pmesh->globalOwners[idn]; + A[*cnt].ownerRank = globalOwners[idn]; (*cnt)++; } } @@ -757,7 +777,9 @@ void BuildFEMMatrixQuad2D(mesh_t 
*femMesh, mesh_t *pmesh, dfloat lambda, dlong * } } -void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A) { +void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, + dlong *localIds, hlong* globalNumbering, int *globalOwners, + dlong *cnt, nonZero_t *A) { #pragma omp parallel for for (dlong e=0;eNelements;e++) { @@ -797,7 +819,7 @@ void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *l A[*cnt].val = val; A[*cnt].row = globalNumbering[idn]; A[*cnt].col = globalNumbering[idm]; - A[*cnt].ownerRank = pmesh->globalOwners[idn]; + A[*cnt].ownerRank = globalOwners[idn]; (*cnt)++; } } @@ -806,7 +828,9 @@ void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *l } } -void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering,dlong *cnt, nonZero_t *A) { +void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, + dlong *localIds, hlong* globalNumbering, int *globalOwners, + dlong *cnt, nonZero_t *A) { #pragma omp parallel for for (dlong e=0;eNelements;e++) { @@ -898,7 +922,7 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *l A[*cnt].val = val; A[*cnt].row = globalNumbering[idn]; A[*cnt].col = globalNumbering[idm]; - A[*cnt].ownerRank = pmesh->globalOwners[idn]; + A[*cnt].ownerRank = globalOwners[idn]; (*cnt)++; } } diff --git a/solvers/elliptic/src/ellipticSetup.c b/solvers/elliptic/src/ellipticSetup.c index 0e124b091..6b08a95b8 100644 --- a/solvers/elliptic/src/ellipticSetup.c +++ b/solvers/elliptic/src/ellipticSetup.c @@ -217,7 +217,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI // gather-scatter if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){ - ellipticParallelGatherScatter(mesh, mesh->ogs, elliptic->o_r, dfloatString, "add"); + ogsGatherScatter(elliptic->o_r, ogsDfloat, ogsAdd, 
mesh->ogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, elliptic->o_r); } diff --git a/solvers/elliptic/src/ellipticSmootherSetup.c b/solvers/elliptic/src/ellipticSmootherSetup.c index 4bd41ea34..3202a68aa 100644 --- a/solvers/elliptic/src/ellipticSmootherSetup.c +++ b/solvers/elliptic/src/ellipticSmootherSetup.c @@ -195,7 +195,7 @@ dfloat maxEigSmoothAx(elliptic_t* elliptic, agmgLevel *level){ //gather-scatter if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - gsParallelGatherScatter(mesh->hostGsh, Vx, dfloatString, "add"); + ogsGatherScatter(Vx, ogsDfloat, ogsAdd, mesh->ogs); for (dlong i=0;iNmasked;i++) Vx[elliptic->maskIds[i]] = 0.; } diff --git a/solvers/elliptic/src/ellipticSolveSetup.c b/solvers/elliptic/src/ellipticSolveSetup.c index e24611f23..566e59ba7 100644 --- a/solvers/elliptic/src/ellipticSolveSetup.c +++ b/solvers/elliptic/src/ellipticSolveSetup.c @@ -209,6 +209,58 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k else occa::setVerboseCompilation(false); #endif + + //setup an unmasked gs handle + int verbose = options.compareArgs("VERBOSE","TRUE") ? 
1:0; + meshParallelGatherScatterSetup(mesh, Ntotal, mesh->globalIds, mesh->comm, verbose); + + //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) + elliptic->mapB = (int *) calloc(mesh->Nelements*mesh->Np,sizeof(int)); + for (dlong e=0;eNelements;e++) { + for (int n=0;nNp;n++) elliptic->mapB[n+e*mesh->Np] = 1E9; + for (int f=0;fNfaces;f++) { + int bc = mesh->EToB[f+e*mesh->Nfaces]; + if (bc>0) { + for (int n=0;nNfp;n++) { + int BCFlag = elliptic->BCType[bc]; + int fid = mesh->faceNodes[n+f*mesh->Nfp]; + elliptic->mapB[fid+e*mesh->Np] = mymin(BCFlag,elliptic->mapB[fid+e*mesh->Np]); + } + } + } + } + ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs); + + //use the bc flags to find masked ids + elliptic->Nmasked = 0; + for (dlong n=0;nNelements*mesh->Np;n++) { + if (elliptic->mapB[n] == 1E9) { + elliptic->mapB[n] = 0.; + } else if (elliptic->mapB[n] == 1) { //Dirichlet boundary + elliptic->Nmasked++; + } + } + elliptic->o_mapB = mesh->device.malloc(mesh->Nelements*mesh->Np*sizeof(int), elliptic->mapB); + + elliptic->maskIds = (dlong *) calloc(elliptic->Nmasked, sizeof(dlong)); + elliptic->Nmasked =0; //reset + for (dlong n=0;nNelements*mesh->Np;n++) { + if (elliptic->mapB[n] == 1) elliptic->maskIds[elliptic->Nmasked++] = n; + } + if (elliptic->Nmasked) elliptic->o_maskIds = mesh->device.malloc(elliptic->Nmasked*sizeof(dlong), elliptic->maskIds); + + //make a masked version of the global id numbering + mesh->maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong)); + memcpy(mesh->maskedGlobalIds, mesh->globalIds, Ntotal*sizeof(hlong)); + for (dlong n=0;nNmasked;n++) + mesh->maskedGlobalIds[elliptic->maskIds[n]] = 0; + + //use the masked ids to make another gs handle + elliptic->ogs = ogsSetup(Ntotal, mesh->maskedGlobalIds, mesh->comm, verbose, mesh->device); + elliptic->o_invDegree = elliptic->ogs->o_invDegree; + + /*preconditioner setup */ + elliptic->precon = (precon_t*) calloc(1, sizeof(precon_t)); kernelInfo["parser/" 
"automate-add-barriers"] = "disabled"; @@ -240,34 +292,8 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k //mesh kernels mesh->haloExtractKernel = mesh->device.buildKernel(DHOLMES "/okl/meshHaloExtract2D.okl", - "meshHaloExtract2D", - kernelInfo); - - mesh->gatherKernel = - mesh->device.buildKernel(DHOLMES "/okl/gather.okl", - "gather", - kernelInfo); - - mesh->scatterKernel = - mesh->device.buildKernel(DHOLMES "/okl/scatter.okl", - "scatter", - kernelInfo); - - mesh->gatherScatterKernel = - mesh->device.buildKernel(DHOLMES "/okl/gatherScatter.okl", - "gatherScatter", - kernelInfo); - - mesh->getKernel = - mesh->device.buildKernel(DHOLMES "/okl/get.okl", - "get", - kernelInfo); - - mesh->putKernel = - mesh->device.buildKernel(DHOLMES "/okl/put.okl", - "put", - kernelInfo); - + "meshHaloExtract2D", + kernelInfo); mesh->addScalarKernel = mesh->device.buildKernel(DHOLMES "/okl/addScalar.okl", @@ -290,44 +316,44 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k elliptic->weightedInnerProduct1Kernel = mesh->device.buildKernel(DHOLMES "/okl/weightedInnerProduct1.okl", - "weightedInnerProduct1", - kernelInfo); + "weightedInnerProduct1", + kernelInfo); elliptic->weightedInnerProduct2Kernel = mesh->device.buildKernel(DHOLMES "/okl/weightedInnerProduct2.okl", - "weightedInnerProduct2", - kernelInfo); + "weightedInnerProduct2", + kernelInfo); elliptic->innerProductKernel = mesh->device.buildKernel(DHOLMES "/okl/innerProduct.okl", - "innerProduct", - kernelInfo); + "innerProduct", + kernelInfo); elliptic->weightedNorm2Kernel = mesh->device.buildKernel(DHOLMES "/okl/weightedNorm2.okl", - "weightedNorm2", - kernelInfo); + "weightedNorm2", + kernelInfo); elliptic->norm2Kernel = mesh->device.buildKernel(DHOLMES "/okl/norm2.okl", - "norm2", - kernelInfo); + "norm2", + kernelInfo); elliptic->scaledAddKernel = mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl", - "scaledAdd", - kernelInfo); + "scaledAdd", + 
kernelInfo); elliptic->dotMultiplyKernel = mesh->device.buildKernel(DHOLMES "/okl/dotMultiply.okl", - "dotMultiply", - kernelInfo); + "dotMultiply", + kernelInfo); elliptic->dotDivideKernel = mesh->device.buildKernel(DHOLMES "/okl/dotDivide.okl", - "dotDivide", - kernelInfo); + "dotDivide", + kernelInfo); // add custom defines kernelInfo["defines/" "p_NpP"]= (mesh->Np+mesh->Nfp*mesh->Nfaces); @@ -388,14 +414,14 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k elliptic->AxKernel = mesh->device.buildKernel(fileName,kernelName,dfloatKernelInfo); if(elliptic->elementType!=HEXAHEDRA){ - sprintf(kernelName, "ellipticPartialAx%s", suffix); + sprintf(kernelName, "ellipticPartialAx%s", suffix); } else{ - if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")){ - sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix); - }else{ - sprintf(kernelName, "ellipticPartialAx%s", suffix); - } + if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")){ + sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix); + }else{ + sprintf(kernelName, "ellipticPartialAx%s", suffix); + } } elliptic->partialAxKernel = mesh->device.buildKernel(fileName,kernelName,dfloatKernelInfo); @@ -436,59 +462,6 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix); elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo); } - } - MPI_Barrier(mesh->comm); - } - - //on-host version of gather-scatter - int verbose = options.compareArgs("VERBOSE","TRUE") ? 
1:0; - mesh->hostGsh = gsParallelGatherScatterSetup(mesh->comm, mesh->Nelements*mesh->Np, mesh->globalIds,verbose); - - // set up separate gather scatter infrastructure for halo and non halo nodes - ellipticParallelGatherScatterSetup(elliptic); - - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - elliptic->mapB = (int *) calloc(mesh->Nelements*mesh->Np,sizeof(int)); - for (dlong e=0;eNelements;e++) { - for (int n=0;nNp;n++) elliptic->mapB[n+e*mesh->Np] = 1E9; - for (int f=0;fNfaces;f++) { - int bc = mesh->EToB[f+e*mesh->Nfaces]; - if (bc>0) { - for (int n=0;nNfp;n++) { - int BCFlag = elliptic->BCType[bc]; - int fid = mesh->faceNodes[n+f*mesh->Nfp]; - elliptic->mapB[fid+e*mesh->Np] = mymin(BCFlag,elliptic->mapB[fid+e*mesh->Np]); - } - } - } - } - gsParallelGatherScatter(mesh->hostGsh, elliptic->mapB, "int", "min"); - - - //use the bc flags to find masked ids - elliptic->Nmasked = 0; - for (dlong n=0;nNelements*mesh->Np;n++) { - if (elliptic->mapB[n] == 1E9) { - elliptic->mapB[n] = 0.; - } else if (elliptic->mapB[n] == 1) { //Dirichlet boundary - elliptic->Nmasked++; - } - } - elliptic->o_mapB = mesh->device.malloc(mesh->Nelements*mesh->Np*sizeof(int), elliptic->mapB); - - elliptic->maskIds = (dlong *) calloc(elliptic->Nmasked, sizeof(dlong)); - elliptic->Nmasked =0; //reset - for (dlong n=0;nNelements*mesh->Np;n++) { - if (elliptic->mapB[n] == 1) elliptic->maskIds[elliptic->Nmasked++] = n; - } - if (elliptic->Nmasked) elliptic->o_maskIds = mesh->device.malloc(elliptic->Nmasked*sizeof(dlong), elliptic->maskIds); - - /*preconditioner setup */ - elliptic->precon = (precon_t*) calloc(1, sizeof(precon_t)); - - - for (int r=0;rsize;r++) { - if (r==mesh->rank) { sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix); sprintf(kernelName, "ellipticPreconCoarsen%s", suffix); @@ -522,7 +495,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k kernelInfo); } } - MPI_Barrier(mesh->comm); 
+ MPI_Barrier(mesh->comm); } long long int pre = mesh->device.memoryAllocated(); diff --git a/solvers/gradient/makefile b/solvers/gradient/makefile index eb2ad1bc5..1e895cd41 100644 --- a/solvers/gradient/makefile +++ b/solvers/gradient/makefile @@ -9,6 +9,7 @@ include ${OCCA_DIR}/scripts/Makefile # define variables HDRDIR = ../../include +OGSDIR = ../../libs/gatherScatter # set options for this machine # specify which compilers to use for c, fortran and linking @@ -16,7 +17,7 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DGRADIENT='"${CURDIR}"' +CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DGRADIENT='"${CURDIR}"' # link flags to be used LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g diff --git a/solvers/ins/makefile b/solvers/ins/makefile index 9b564a09f..ed9ac8158 100644 --- a/solvers/ins/makefile +++ b/solvers/ins/makefile @@ -10,7 +10,8 @@ include ${OCCA_DIR}/scripts/Makefile # define variables HDRDIR = ../../include -GSDIR = ../../3rdParty/gslib.github/src +GSDIR = ../../3rdParty/gslib +OGSDIR = ../../libs/gatherScatter ALMONDDIR = ../parALMOND ELLIPTICDIR = ../elliptic @@ -21,21 +22,23 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(GSDIR) -I$(ELLIPTICDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DINS='"${CURDIR}"' +CFLAGS = -I. 
-DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -I$(ELLIPTICDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DINS='"${CURDIR}"' # link flags to be used -LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g -L../../3rdParty/gslib.github -lgs \ - -L$(ELLIPTICDIR) -lelliptic -L$(ALMONDDIR) -lparALMOND +LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g # libraries to be linked in -LIBS = -L$(OCCA_DIR)/lib $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran +LIBS = -L$(ELLIPTICDIR) -lelliptic -L$(ALMONDDIR) -lparALMOND \ + -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \ + -L$(OCCA_DIR)/lib $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran \ + INCLUDES = ins.h DEPS = $(INCLUDES) \ $(HDRDIR)/mesh.h \ $(HDRDIR)/mesh2D.h \ $(HDRDIR)/mesh3D.h \ -$(HDRDIR)/ogs_t.h \ +$(OGSDIR)/ogs.hpp \ $(ALMONDDIR)/parALMOND.h \ $(ELLIPTICDIR)/elliptic.h \ $(ELLIPTICDIR)/ellipticPrecon.h @@ -98,7 +101,6 @@ LOBJS = \ ../../src/meshParallelConnectNodes.o \ ../../src/meshParallelConnectOpt.o \ ../../src/meshParallelConsecutiveGlobalNumbering.o\ -../../src/meshParallelGatherScatter.o \ ../../src/meshParallelGatherScatterSetup.o \ ../../src/meshParallelReaderTri2D.o \ ../../src/meshParallelReaderQuad2D.o \ @@ -134,39 +136,33 @@ LOBJS = \ ../../src/occaHostMallocPinned.o \ ../../src/timer.o -COBJS = \ -../../src/gsParallelGatherScatter.o\ -../../src/gsParallelGatherScatterSetup.o\ -../../src/xxtCoarseSolve.o - -insMain:$(AOBJS) $(LOBJS) ./src/insMain.o gslibInterface - cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/ins - cd ../parALMOND; make -j lib; cd ../ins - cd ../elliptic; make -j lib; cd ../ins +insMain:$(AOBJS) $(LOBJS) ./src/insMain.o libblas libogs libparALMOND libelliptic $(LD) $(LDFLAGS) -o insMain ./src/insMain.o $(COBJS) $(AOBJS) $(LOBJS) $(paths) $(LIBS) lib:$(AOBJS) ar -cr libins.a $(AOBJS) -gslibInterface: - cd ../../3rdParty/gslib.github; make -j libgs.a; cd ../../solvers/ins - $(cc) $(CFLAGS) -c -o 
../../src/gsParallelGatherScatter.o ../../src/gsParallelGatherScatter.c $(paths) - $(cc) $(CFLAGS) -c -o ../../src/gsParallelGatherScatterSetup.o ../../src/gsParallelGatherScatterSetup.c $(paths) - $(cc) $(CFLAGS) -c -o ../../src/xxtCoarseSolve.o ../../src/xxtCoarseSolve.c $(paths) +libogs: + cd ../../libs/gatherScatter; make -j lib; cd ../../solvers/ins + +libblas: + cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/ins + +libparALMOND: + cd ../parALMOND; make -j lib; cd ../ins + +libelliptic: + cd ../elliptic; make -j lib; cd ../ins all: lib insMain # what to do if user types "make clean" clean: - cd ../parALMOND; make clean; cd ../ins cd ../elliptic; make clean; cd ../ins cd ../../src; rm *.o; cd ../solvers/ins rm ./src/*.o insMain libins.a realclean: - cd ../../3rdParty/gslib.github; make clean; cd ../../solvers/ins - cd ../../3rdParty/BlasLapack; make clean; cd ../../solvers/ins - cd ../parALMOND; make clean; cd ../ins - cd ../elliptic; make clean; cd ../ins + cd ../elliptic; make realclean; cd ../ins cd ../../src; rm *.o; cd ../solvers/ins rm ./src/*.o insMain libins.a diff --git a/solvers/ins/src/insDiffusion.c b/solvers/ins/src/insDiffusion.c index 9aee94efa..f5a7d1f3d 100644 --- a/solvers/ins/src/insDiffusion.c +++ b/solvers/ins/src/insDiffusion.c @@ -36,9 +36,9 @@ void insDiffusion(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_LU){ if(options.compareArgs("DISCRETIZATION", "CONTINUOUS")){ ogs_t *ogs = mesh->ogs; - if(ogs->NhaloGather) { - ins->diffusionKernel(uSolver->NglobalGatherElements, - uSolver->o_globalGatherElementList, + if(mesh->NglobalGatherElements) + ins->diffusionKernel(mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, mesh->o_ggeo, mesh->o_vgeo, mesh->o_sgeo, @@ -56,21 +56,12 @@ void insDiffusion(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_LU){ o_U, o_LU); - mesh->device.finish(); - mesh->device.setStream(uSolver->dataStream); - mesh->gatherKernel(ogs->NhaloGather, - ogs->o_haloGatherOffsets, 
- ogs->o_haloGatherLocalIds, - o_LU, - ins->NVfields, - ins->fieldOffset, - ins->o_velocityHaloGatherTmp); - ins->o_velocityHaloGatherTmp.copyTo(ins->velocityHaloGatherTmp,"async: true"); - mesh->device.setStream(uSolver->defaultStream); - } - if(uSolver->NlocalGatherElements){ - ins->diffusionKernel(uSolver->NlocalGatherElements, - uSolver->o_localGatherElementList, + ogsGatherScatterManyStart(o_LU, ins->dim, ins->fieldOffset, + ogsDfloat, ogsAdd, ogs); + + if(mesh->NlocalGatherElements) + ins->diffusionKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, mesh->o_ggeo, mesh->o_vgeo, mesh->o_sgeo, @@ -87,43 +78,9 @@ void insDiffusion(ins_t *ins, dfloat time, occa::memory o_U, occa::memory o_LU){ ins->fieldOffset, o_U, o_LU); - } - - // finalize gather using local and global contributions - if(ogs->NnonHaloGather) - mesh->gatherScatterKernel(ogs->NnonHaloGather, - ogs->o_nonHaloGatherOffsets, - ogs->o_nonHaloGatherLocalIds, - ins->NVfields, - ins->fieldOffset, - o_LU); - - // C0 halo gather-scatter (on data stream) - if(ogs->NhaloGather) { - mesh->device.setStream(uSolver->dataStream); - mesh->device.finish(); - - // MPI based gather scatter using libgs - gsVecParallelGatherScatter(ogs->haloGsh, ins->velocityHaloGatherTmp, ins->NVfields, dfloatString, "add"); - - // copy totally gather halo data back from HOST to DEVICE - ins->o_velocityHaloGatherTmp.copyFrom(ins->velocityHaloGatherTmp,"async: true"); - - // do scatter back to local nodes - mesh->scatterKernel(ogs->NhaloGather, - ogs->o_haloGatherOffsets, - ogs->o_haloGatherLocalIds, - ins->NVfields, - ins->fieldOffset, - ins->o_velocityHaloGatherTmp, - o_LU); - mesh->device.setStream(uSolver->defaultStream); - } - mesh->device.finish(); - mesh->device.setStream(uSolver->dataStream); - mesh->device.finish(); - mesh->device.setStream(uSolver->defaultStream); + ogsGatherScatterManyFinish(o_LU, ins->dim, ins->fieldOffset, + ogsDfloat, ogsAdd, ogs); } else if(options.compareArgs("DISCRETIZATION", 
"IPDG")) { dlong offset = 0; diff --git a/solvers/ins/src/insPressureSolve.c b/solvers/ins/src/insPressureSolve.c index 94e9f250d..fdf2a908c 100644 --- a/solvers/ins/src/insPressureSolve.c +++ b/solvers/ins/src/insPressureSolve.c @@ -80,7 +80,7 @@ void insPressureSolve(ins_t *ins, dfloat time, int stage){ // gather-scatter if(ins->pOptions.compareArgs("DISCRETIZATION","CONTINUOUS")){ - ellipticParallelGatherScatter(mesh, mesh->ogs, ins->o_rhsP, dfloatString, "add"); + ogsGatherScatter(ins->o_rhsP, ogsDfloat, ogsAdd, mesh->ogs); if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_rhsP); if (solver->Nmasked) mesh->maskKernel(solver->Nmasked, solver->o_maskIds, ins->o_PI); } diff --git a/solvers/ins/src/insReport.c b/solvers/ins/src/insReport.c index 0f1f006e9..e8eba1be2 100644 --- a/solvers/ins/src/insReport.c +++ b/solvers/ins/src/insReport.c @@ -45,18 +45,18 @@ void insReport(ins_t *ins, dfloat time, int tstep){ ins->o_Div); // gatherscatter vorticity field + dlong Ntotal = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np; for(int s=0; sdim; s++){ - dlong Ntotal = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np; ins->o_UH.copyFrom(ins->o_Vort,Ntotal*sizeof(dfloat),0,s*ins->fieldOffset*sizeof(dfloat)); - - ellipticParallelGatherScatter(mesh, mesh->ogs, ins->o_UH, dfloatString, "add"); + + ogsGatherScatter(ins->o_UH, ogsDfloat, ogsAdd, mesh->ogs); ins->pSolver->dotMultiplyKernel(mesh->Nelements*mesh->Np, mesh->ogs->o_invDegree, ins->o_UH, ins->o_UH); ins->o_UH.copyTo(ins->o_Vort,Ntotal*sizeof(dfloat),s*ins->fieldOffset*sizeof(dfloat),0); } // gather-scatter divergence - ellipticParallelGatherScatter(mesh, mesh->ogs, ins->o_Div, dfloatString, "add"); + ogsGatherScatter(ins->o_Div, ogsDfloat, ogsAdd, mesh->ogs); ins->pSolver->dotMultiplyKernel(mesh->Nelements*mesh->Np, mesh->ogs->o_invDegree, ins->o_Div, ins->o_Div); // copy data back to host diff --git a/solvers/ins/src/insSetup.c b/solvers/ins/src/insSetup.c index 4aace8576..2043d440c 
100644 --- a/solvers/ins/src/insSetup.c +++ b/solvers/ins/src/insSetup.c @@ -645,8 +645,8 @@ ins_t *insSetup(mesh_t *mesh, setupAide options){ } } } - gsParallelGatherScatter(mesh->hostGsh, ins->VmapB, "int", "min"); - gsParallelGatherScatter(mesh->hostGsh, ins->PmapB, "int", "max"); + ogsGatherScatter(ins->VmapB, ogsInt, ogsMin, mesh->ogs); + ogsGatherScatter(ins->PmapB, ogsInt, ogsMax, mesh->ogs); for (int n=0;nNelements*mesh->Np;n++) { if (ins->VmapB[n] == 1E9) { diff --git a/solvers/ins/src/insVelocitySolve.c b/solvers/ins/src/insVelocitySolve.c index de64558cb..e28b4f04d 100644 --- a/solvers/ins/src/insVelocitySolve.c +++ b/solvers/ins/src/insVelocitySolve.c @@ -58,10 +58,10 @@ void insVelocitySolve(ins_t *ins, dfloat time, int stage, occa::memory o_rhsU, o_rhsW); // gather-scatter - ellipticParallelGatherScatter(mesh, mesh->ogs, o_rhsU, dfloatString, "add"); - ellipticParallelGatherScatter(mesh, mesh->ogs, o_rhsV, dfloatString, "add"); + ogsGatherScatter(o_rhsU, ogsDfloat, ogsAdd, mesh->ogs); + ogsGatherScatter(o_rhsV, ogsDfloat, ogsAdd, mesh->ogs); if (ins->dim==3) - ellipticParallelGatherScatter(mesh, mesh->ogs, o_rhsW, dfloatString, "add"); + ogsGatherScatter(o_rhsW, ogsDfloat, ogsAdd, mesh->ogs); if (usolver->Nmasked) mesh->maskKernel(usolver->Nmasked, usolver->o_maskIds, o_rhsU); if (vsolver->Nmasked) mesh->maskKernel(vsolver->Nmasked, vsolver->o_maskIds, o_rhsV); if (ins->dim==3) diff --git a/solvers/parALMOND/makefile b/solvers/parALMOND/makefile index bc15fd167..385eb8d5d 100644 --- a/solvers/parALMOND/makefile +++ b/solvers/parALMOND/makefile @@ -9,11 +9,10 @@ includes = $(wildcard $(iDir)/*h) objects = $(subst $(sDir)/,$(objDir)/,$(sources:.c=.o)) deps = $(includes) \ ../../include/mesh.h \ -../../include/ogs_t.h \ -../../include/hgs_t.h \ +../../libs/gatherScatter/ogs.hpp \ ../../include/parAlmond.h -flags = -DOCCA_VERSION_1_0 -I${OCCA_DIR}/include -I$(iDir) -I../../include +flags = -DOCCA_VERSION_1_0 -I${OCCA_DIR}/include -I$(iDir) -I../../include 
-I../../libs/gatherScatter libs = -L${OCCA_DIR}/lib -locca -llapack -lblas flags += -D DPWD='"${CURDIR}"' diff --git a/src/meshParallelConnectNodes.c b/src/meshParallelConnectNodes.c index f66a5e455..7368384b9 100644 --- a/src/meshParallelConnectNodes.c +++ b/src/meshParallelConnectNodes.c @@ -31,81 +31,20 @@ SOFTWARE. #include "mesh.h" -// int rank for this process (not host thread safe) -int localRank = -1; - typedef struct{ - dlong element; // local element id - int node; // local node id - int rank; // rank of original node - hlong id; // original id - int haloFlag; - - // info on base node (lowest rank node) - dlong baseElement; - int baseNode; int baseRank; hlong baseId; - hlong newGlobalId; - }parallelNode_t; -// compare on base rank then by globalId -int parallelCompareOwners(const void *a, const void *b){ - - parallelNode_t *fa = (parallelNode_t*) a; - parallelNode_t *fb = (parallelNode_t*) b; - - if(fa->baseRank < fb->baseRank) return -1; - if(fa->baseRank > fb->baseRank) return +1; - - return 0; -} - -// compare on base rank then by globalId -int parallelCompareSourceRank(const void *a, const void *b){ - - parallelNode_t *fa = (parallelNode_t*) a; - parallelNode_t *fb = (parallelNode_t*) b; - - if(fa->rank < fb->rank) return -1; - if(fa->rank > fb->rank) return +1; - - return 0; -} - -// compare on base rank for sorting suitable for destination -int parallelCompareBaseNodes(const void *a, const void *b){ - - parallelNode_t *fa = (parallelNode_t*) a; - parallelNode_t *fb = (parallelNode_t*) b; - int rank = localRank; - - if ((fa->baseRank==rank)&&(fb->baseRank!=rank)) return -1; //move locally-owned nodes to the beginning - if ((fa->baseRank!=rank)&&(fb->baseRank==rank)) return 1; - - if(fa->baseRank < fb->baseRank) return -1; - if(fa->baseRank > fb->baseRank) return +1; - - if(fa->haloFlag < fb->haloFlag) return -1; - if(fa->haloFlag > fb->haloFlag) return +1; - - if(fa->baseId < fb->baseId) return -1; - if(fa->baseId > fb->baseId) return +1; - - return 
0; -} - -// iteratively find a gather numbering for all local element nodes +// uniquely label each node with a global index, used for gatherScatter void meshParallelConnectNodes(mesh_t *mesh){ int rank, size; rank = mesh->rank; size = mesh->size; - localRank = rank; dlong localNodeCount = mesh->Np*mesh->Nelements; dlong *allLocalNodeCounts = (dlong*) calloc(size, sizeof(dlong)); @@ -121,7 +60,7 @@ void meshParallelConnectNodes(mesh_t *mesh){ free(allLocalNodeCounts); // form continuous node numbering (local=>virtual gather) - parallelNode_t *sendNodes = + parallelNode_t *localNodes = (parallelNode_t*) calloc((mesh->totalHaloPairs+mesh->Nelements)*mesh->Np, sizeof(parallelNode_t)); @@ -129,15 +68,9 @@ void meshParallelConnectNodes(mesh_t *mesh){ for(dlong e=0;eNelements;++e){ for(int n=0;nNp;++n){ dlong id = e*mesh->Np+n; - sendNodes[id].element = e; - sendNodes[id].node = n; - sendNodes[id].rank = rank; - sendNodes[id].id = 1 + id + mesh->Nnodes + gatherNodeStart; - sendNodes[id].baseElement = e; - sendNodes[id].baseNode = n; - sendNodes[id].baseRank = rank; - sendNodes[id].baseId = 1 + id + mesh->Nnodes + gatherNodeStart; + localNodes[id].baseRank = rank; + localNodes[id].baseId = 1 + id + mesh->Nnodes + gatherNodeStart; } @@ -145,18 +78,7 @@ void meshParallelConnectNodes(mesh_t *mesh){ for(int v=0;vNverts;++v){ dlong id = e*mesh->Np + mesh->vertexNodes[v]; hlong gid = mesh->EToV[e*mesh->Nverts+v] + 1; - sendNodes[id].id = gid; - sendNodes[id].baseId = gid; - } - - // label halo flags - for(int f=0;fNfaces;++f){ - if(mesh->EToP[e*mesh->Nfaces+f]!=-1){ - for(int n=0;nNfp;++n){ - dlong id = e*mesh->Np+mesh->faceNodes[f*mesh->Nfp+n]; - sendNodes[id].haloFlag = 1; - } - } + localNodes[id].baseId = gid; } } @@ -173,7 +95,7 @@ void meshParallelConnectNodes(mesh_t *mesh){ // send halo data and recv into extension of buffer meshHaloExchange(mesh, mesh->Np*sizeof(parallelNode_t), - sendNodes, sendBuffer, sendNodes+localNodeCount); + localNodes, sendBuffer, 
localNodes+localNodeCount); // compare trace nodes for(dlong e=0;eNelements;++e){ @@ -181,35 +103,22 @@ void meshParallelConnectNodes(mesh_t *mesh){ dlong id = e*mesh->Nfp*mesh->Nfaces + n; dlong idM = mesh->vmapM[id]; dlong idP = mesh->vmapP[id]; - hlong gidM = sendNodes[idM].baseId; - hlong gidP = sendNodes[idP].baseId; + hlong gidM = localNodes[idM].baseId; + hlong gidP = localNodes[idP].baseId; - int baseRankM = sendNodes[idM].baseRank; - int baseRankP = sendNodes[idP].baseRank; - - // use minimum of trace variables - int haloM = sendNodes[idM].haloFlag; - int haloP = sendNodes[idP].haloFlag; - if(haloM!=haloP) { - ++localChange; - sendNodes[idM].haloFlag = mymax(haloM, haloP); - sendNodes[idP].haloFlag = mymax(haloM, haloP); - } + int baseRankM = localNodes[idM].baseRank; + int baseRankP = localNodes[idP].baseRank; if(gidMcomm); } - // sort based on base nodes (rank,element,node at base) - qsort(sendNodes, localNodeCount, sizeof(parallelNode_t), parallelCompareOwners); - - // Make the MPI_PARALLELNODE_T data type - MPI_Datatype MPI_PARALLELNODE_T; - MPI_Datatype dtype[10] = {MPI_DLONG, MPI_INT, MPI_INT, MPI_HLONG, MPI_INT, - MPI_DLONG, MPI_INT, MPI_INT, MPI_HLONG, MPI_HLONG}; - int blength[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - MPI_Aint addr[10], displ[10]; - MPI_Get_address ( &(sendNodes[0] ), addr+0); - MPI_Get_address ( &(sendNodes[0].node ), addr+1); - MPI_Get_address ( &(sendNodes[0].rank ), addr+2); - MPI_Get_address ( &(sendNodes[0].id ), addr+3); - MPI_Get_address ( &(sendNodes[0].haloFlag ), addr+4); - MPI_Get_address ( &(sendNodes[0].baseElement), addr+5); - MPI_Get_address ( &(sendNodes[0].baseNode ), addr+6); - MPI_Get_address ( &(sendNodes[0].baseRank ), addr+7); - MPI_Get_address ( &(sendNodes[0].baseId ), addr+8); - MPI_Get_address ( &(sendNodes[0].newGlobalId), addr+9); - displ[0] = 0; - displ[1] = addr[1] - addr[0]; - displ[2] = addr[2] - addr[0]; - displ[3] = addr[3] - addr[0]; - displ[4] = addr[4] - addr[0]; - displ[5] = addr[5] - addr[0]; 
- displ[6] = addr[6] - addr[0]; - displ[7] = addr[7] - addr[0]; - displ[8] = addr[8] - addr[0]; - displ[9] = addr[9] - addr[0]; - MPI_Type_create_struct (10, blength, displ, dtype, &MPI_PARALLELNODE_T); - MPI_Type_commit (&MPI_PARALLELNODE_T); - - // count how many nodes to send to each process - int *sendCounts = (int *) calloc(size,sizeof(int)); - int *recvCounts = (int *) calloc(size,sizeof(int)); - int *sendOffsets = (int *) calloc(size+1,sizeof(int)); - int *recvOffsets = (int *) calloc(size+1,sizeof(int)); - - for(dlong n=0;ncomm); - - // find send and recv offsets for gather - dlong recvNtotal = 0; - for(int r=0;rcomm); - - // sort by global index shifting halo nodes to the end - qsort(recvNodes, recvNtotal, sizeof(parallelNode_t), parallelCompareBaseNodes); - - // renumber unique nodes starting from 0 (need to be careful about zeros) - dlong Ngather = 0; - if (recvNtotal) recvNodes[0].newGlobalId = Ngather; - for(dlong n=1;ncomm); - - // cumulative sum of unique node counts => starting node index for each process - mesh->gatherGlobalStarts = (hlong*) calloc(size+1, sizeof(hlong)); - for(int r=0;rgatherGlobalStarts[r+1] = mesh->gatherGlobalStarts[r] + allGather[r]; - - // shift numbering - for(dlong n=0;ngatherGlobalStarts[rank]; - - // sort by rank, local index - qsort(recvNodes, recvNtotal, sizeof(parallelNode_t), parallelCompareSourceRank); - - // reverse all to all to reclaim nodes - MPI_Alltoallv(recvNodes, recvCounts, recvOffsets, MPI_PARALLELNODE_T, - sendNodes, sendCounts, sendOffsets, MPI_PARALLELNODE_T, - mesh->comm); - - // sort by rank, local index - qsort(sendNodes, localNodeCount, sizeof(parallelNode_t), parallelCompareBaseNodes); - - // extract base index of each node (i.e. 
gather numbering) - mesh->gatherLocalIds = (dlong*) calloc(localNodeCount, sizeof(dlong)); - mesh->gatherBaseIds = (hlong*) calloc(localNodeCount, sizeof(hlong)); - mesh->gatherBaseRanks = (int*) calloc(localNodeCount, sizeof(int)); - mesh->gatherHaloFlags = (int*) calloc(localNodeCount, sizeof(int)); - + //make a locally-ordered version + mesh->globalIds = (hlong*) calloc(localNodeCount, sizeof(hlong)); for(dlong id=0;idgatherLocalIds[id] = sendNodes[id].element*mesh->Np+sendNodes[id].node; - mesh->gatherBaseIds[id] = sendNodes[id].newGlobalId+1; - mesh->gatherBaseRanks[id] = sendNodes[id].baseRank; - mesh->gatherHaloFlags[id] = sendNodes[id].haloFlag; + mesh->globalIds[id] = localNodes[id].baseId; } - MPI_Barrier(mesh->comm); - MPI_Type_free(&MPI_PARALLELNODE_T); + free(localNodes); free(sendBuffer); - free(sendNodes); - - //make a locally-ordered version - mesh->globalIds = (hlong*) calloc(localNodeCount, sizeof(hlong)); - mesh->globalOwners = (int*) calloc(localNodeCount, sizeof(int)); - mesh->globalHaloFlags= (int*) calloc(localNodeCount, sizeof(int)); - - for(dlong id=0;idgatherLocalIds[id]; - mesh->globalIds[localId] = mesh->gatherBaseIds[id]; - mesh->globalOwners[localId] = mesh->gatherBaseRanks[id]; - mesh->globalHaloFlags[localId]= mesh->gatherHaloFlags[id]; - } } diff --git a/src/meshParallelGatherScatterSetup.c b/src/meshParallelGatherScatterSetup.c index 7c4d72f50..7d4bf3515 100644 --- a/src/meshParallelGatherScatterSetup.c +++ b/src/meshParallelGatherScatterSetup.c @@ -30,165 +30,76 @@ SOFTWARE. 
#include "mesh.h" -// assume nodes locally sorted by rank then global index -// assume gather and scatter are the same sets -ogs_t *meshParallelGatherScatterSetup(mesh_t *mesh, - dlong Nlocal, - dlong *gatherLocalIds, - hlong *gatherBaseIds, - int *gatherBaseRanks, - int *gatherHaloFlags, +void meshParallelGatherScatterSetup(mesh_t *mesh, + dlong N, + dlong *globalIds, + MPI_Comm &comm, int verbose) { - int rank; - rank = mesh->rank; - - ogs_t *ogs = (ogs_t*) calloc(1, sizeof(ogs_t)); - - // 0. squeeze out negative globalIds - hlong *baseIds = (hlong*) calloc(Nlocal,sizeof(hlong)); - dlong *localIds = (dlong*) calloc(Nlocal,sizeof(dlong)); - int *baseRanks = (int*) calloc(Nlocal,sizeof(int)); - int *haloFlags = (int*) calloc(Nlocal,sizeof(int)); - - dlong Ntotal = 0; - dlong Nhalo = 0; - dlong NnonHalo = 0; - for (dlong n=0;nogs = ogsSetup(N, globalIds, comm, verbose, mesh->device); - // ------------------------------------------------------------ - // 1. count number of unique base nodes on this process - ogs->NhaloGather = 0; - ogs->NnonHaloGather = 0; - ogs->NownedHalo = 0; - for(dlong n=0;nNtotalGather += test; - if (haloFlags[n]==1) ogs->NhaloGather += test; - if (haloFlags[n]!=1) ogs->NnonHaloGather += test; - if ((haloFlags[n]==1)&&(baseRanks[n]==rank)) ogs->NownedHalo +=test; + //use the gs to find what nodes are local to this rank + int *minRank = (int *) calloc(N,sizeof(int)); + int *maxRank = (int *) calloc(N,sizeof(int)); + for (dlong i=0;ihaloGatherBaseIds = (hlong*) calloc(ogs->NhaloGather, sizeof(hlong)); // offset into sorted list of nodes - ogs->haloGatherOffsets = (dlong*) calloc(ogs->NhaloGather+1, sizeof(dlong)); // offset into sorted list of nodes - ogs->haloGatherLocalIds = (dlong*) calloc(Nhalo, sizeof(dlong)); - - ogs->ownedHaloGatherIds = (dlong*) calloc(ogs->NhaloGather, sizeof(dlong)); - - ogs->nonHaloGatherBaseIds = (hlong*) calloc(ogs->NnonHaloGather, sizeof(hlong)); // offset into sorted list of nodes - ogs->nonHaloGatherOffsets = 
(dlong*) calloc(ogs->NnonHaloGather+1, sizeof(dlong)); // offset into sorted list of nodes - ogs->nonHaloGatherLocalIds = (dlong*) calloc(NnonHalo, sizeof(dlong)); - - dlong haloOffset = ogs->NnonHaloGather; - - // only finds bases - ogs->NhaloGather = 0; - ogs->NnonHaloGather = 0; - Nhalo = 0; - NnonHalo = 0; - for(dlong n=0;nhaloGatherOffsets[ogs->NhaloGather] = Nhalo; - ogs->haloGatherBaseIds[ogs->NhaloGather] = baseIds[n]+1; - - if (baseRanks[n]==rank) { - ogs->ownedHaloGatherIds[ogs->NhaloGather] = haloOffset++; - } else { - ogs->ownedHaloGatherIds[ogs->NhaloGather] = -1; - } - - ogs->NhaloGather++; - } - ogs->haloGatherLocalIds[Nhalo] = localIds[n]; - Nhalo++; - } else { - if(test){ - ogs->nonHaloGatherOffsets[ogs->NnonHaloGather] = NnonHalo; - ogs->nonHaloGatherBaseIds[ogs->NnonHaloGather] = baseIds[n]+1; - ogs->NnonHaloGather++; + ogsGatherScatter(minRank, ogsInt, ogsMin, mesh->ogs); //minRank[n] contains the smallest rank taking part in the gather of node n + ogsGatherScatter(maxRank, ogsInt, ogsMax, mesh->ogs); //maxRank[n] contains the largest rank taking part in the gather of node n + + // count elements that contribute to global C0 gather-scatter + dlong globalCount = 0; + dlong localCount = 0; + for(dlong e=0;eNelements;++e){ + int isHalo = 0; + for(int n=0;nNp;++n){ + dlong id = e*mesh->Np+n; + if ((minRank[id]!=rank)||(maxRank[id]!=rank)) { + isHalo = 1; + break; } - ogs->nonHaloGatherLocalIds[NnonHalo] = localIds[n]; - NnonHalo++; } + globalCount += isHalo; + localCount += 1-isHalo; } - ogs->haloGatherOffsets[ogs->NhaloGather] = Nhalo; - ogs->nonHaloGatherOffsets[ogs->NnonHaloGather] = NnonHalo; - - - // if there are halo nodes to gather - if(ogs->NhaloGather){ -#if 0 - occa::memory o_gatherTmpPinned = mesh->device.mappedAlloc(ogs->NhaloGather*sizeof(dfloat), NULL); - ogs->haloGatherTmp = (dfloat*) o_gatherTmpPinned.getMappedPointer(); // (char*) calloc(ogs->NhaloGather*sizeof(dfloat), sizeof(char)); -#else - ogs->haloGatherTmp = (dfloat*) 
occaHostMallocPinned(mesh->device, ogs->NhaloGather*sizeof(dfloat), NULL, ogs->o_haloGatherTmp); - ogs->o_haloGatherOffsets = mesh->device.malloc((ogs->NhaloGather+1)*sizeof(dlong), ogs->haloGatherOffsets); - ogs->o_haloGatherLocalIds = mesh->device.malloc(Nhalo*sizeof(dlong), ogs->haloGatherLocalIds); -#endif + mesh->globalGatherElementList = (dlong*) calloc(globalCount, sizeof(dlong)); + mesh->localGatherElementList = (dlong*) calloc(localCount, sizeof(dlong)); - ogs->o_ownedHaloGatherIds = mesh->device.malloc(ogs->NhaloGather*sizeof(dlong), ogs->ownedHaloGatherIds); + globalCount = 0; + localCount = 0; - // initiate gslib gather-scatter comm pattern on halo nodes only - ogs->haloGsh = gsParallelGatherScatterSetup(mesh->comm, ogs->NhaloGather, ogs->haloGatherBaseIds,verbose); - } - - // if there are non-halo nodes to gather - if(ogs->NnonHaloGather){ - ogs->o_nonHaloGatherOffsets = mesh->device.malloc((ogs->NnonHaloGather+1)*sizeof(dlong), ogs->nonHaloGatherOffsets); - ogs->o_nonHaloGatherLocalIds = mesh->device.malloc(NnonHalo*sizeof(dlong), ogs->nonHaloGatherLocalIds); + for(dlong e=0;eNelements;++e){ + int isHalo = 0; + for(int n=0;nNp;++n){ + dlong id = e*mesh->Np+n; + if ((minRank[id]!=rank)||(maxRank[id]!=rank)) { + isHalo = 1; + break; + } + } + if(isHalo){ + mesh->globalGatherElementList[globalCount++] = e; + } else{ + mesh->localGatherElementList[localCount++] = e; + } } + //printf("local = %d, global = %d\n", localCount, globalCount); - //number of owned gathered nodes - ogs->Ngather = ogs->NnonHaloGather+ogs->NownedHalo; - - // build degree vectors - ogs->invDegree = (dfloat*) calloc(Nlocal, sizeof(dfloat)); - ogs->gatherInvDegree = (dfloat*) calloc(ogs->Ngather, sizeof(dfloat)); - for(dlong n=0;ninvDegree[n] = 1; - - ogs->o_invDegree = mesh->device.malloc(Nlocal*sizeof(dfloat), ogs->invDegree); - ogs->o_gatherInvDegree = mesh->device.malloc(ogs->Ngather*sizeof(dfloat), ogs->gatherInvDegree); - - meshParallelGather(mesh, ogs, ogs->o_invDegree, 
ogs->o_gatherInvDegree); - - if(ogs->Ngather) - ogs->o_gatherInvDegree.copyTo(ogs->gatherInvDegree); - - meshParallelScatter(mesh, ogs, ogs->o_gatherInvDegree, ogs->o_invDegree); - - ogs->o_invDegree.copyTo(ogs->invDegree); - - for(dlong n=0;ninvDegree[n] = 1./ogs->invDegree[n]; - - for(dlong n=0;nNgather;++n) - ogs->gatherInvDegree[n] = 1./ogs->gatherInvDegree[n]; - - if(ogs->Ngather) - ogs->o_gatherInvDegree.copyFrom(ogs->gatherInvDegree); - - if(Nlocal) - ogs->o_invDegree.copyFrom(ogs->invDegree); + mesh->NglobalGatherElements = globalCount; + mesh->NlocalGatherElements = localCount; - free(baseIds ); - free(localIds ); - free(baseRanks); - free(haloFlags); + if(globalCount) + mesh->o_globalGatherElementList = + mesh->device.malloc(globalCount*sizeof(dlong), mesh->globalGatherElementList); - return ogs; + if(localCount) + mesh->o_localGatherElementList = + mesh->device.malloc(localCount*sizeof(dlong), mesh->localGatherElementList); } diff --git a/tests/ogs/bns/data/bnsUniform2D.h b/tests/ogs/bns/data/bnsUniform2D.h new file mode 100644 index 000000000..d72cbeae4 --- /dev/null +++ b/tests/ogs/bns/data/bnsUniform2D.h @@ -0,0 +1,83 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5 */ +#define boundaryConditionsPML2D(bc, t, x, y, nx, ny, intfx, intfy, q1M, q2M, q3M, q4M, q5M, q6M, q1B, q2B, q3B, q4B, q5B, q6B) \ + { \ + if(bc==1){ \ + *(q1B) = q1M; \ + *(q2B) = -q2M; \ + *(q3B) = -q3M; \ + *(q4B) = q4M; \ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + } else if(bc==4||bc==5){ \ + *(q1B) = q1M; \ + *(q2B) = q2M-2.f*(nx*q2M+ny*q3M)*nx; \ + *(q3B) = q3M-2.f*(nx*q2M+ny*q3M)*ny; \ + *(q4B) = q4M; \ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + } \ + else { \ + *(q1B) = 2.f*p_q1bar - q1M; \ + *(q2B) = 2.f*p_q1bar*intfx*p_isqrtRT - q2M; \ + *(q3B) = 2.f*p_q1bar*intfy*p_isqrtRT - q3M; \ + *(q4B) = 2.f*p_q1bar*intfx*intfy*p_isqrtRT*p_isqrtRT-q4M; \ + *(q5B) = 2.f*p_q1bar*intfx*intfx*p_isqrtRT*p_isqrtRT*p_invsqrt2-q5M; \ + *(q6B) = 2.f*p_q1bar*intfy*intfy*p_isqrtRT*p_isqrtRT*p_invsqrt2-q6M; \ + } \ + } + +// Boundary conditions: Did not check yet +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5 */ +#define boundaryConditions2D(bc, t, x, y, nx, ny, intfx, intfy, q1M, q2M, q3M, q4M, q5M, q6M, q1B, q2B, q3B, q4B, q5B, q6B) \ + { \ + if(bc==1){ \ + *(q1B) = q1M; \ + *(q2B) = -q2M; \ + *(q3B) = -q3M; \ + *(q4B) = q4M; \ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + } else if(bc==2 || bc==3){ \ + *(q1B) = 2.f*p_q1bar - q1M; \ + *(q2B) = 2.f*p_q1bar*intfx*p_isqrtRT - q2M; \ + *(q3B) = 2.f*p_q1bar*intfy*p_isqrtRT - q3M; \ + *(q4B) = 2.f*p_q1bar*intfx*intfy*p_isqrtRT*p_isqrtRT-q4M; \ + *(q5B) = 
2.f*p_q1bar*intfx*intfx*p_isqrtRT*p_isqrtRT*p_invsqrt2-q5M; \ + *(q6B) = 2.f*p_q1bar*intfy*intfy*p_isqrtRT*p_isqrtRT*p_invsqrt2-q6M; \ + } else if(bc==4||bc==5){ \ + *(q1B) = q1M; \ + *(q2B) = q2M-2.f*(nx*q2M+ny*q3M)*nx; \ + *(q3B) = q3M-2.f*(nx*q2M+ny*q3M)*ny; \ + *(q4B) = q4M; \ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + } \ + } diff --git a/tests/ogs/bns/data/bnsUniform3D.h b/tests/ogs/bns/data/bnsUniform3D.h new file mode 100644 index 000000000..323da1079 --- /dev/null +++ b/tests/ogs/bns/data/bnsUniform3D.h @@ -0,0 +1,106 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5 */ +#define boundaryConditionsPML3D(bc, t, x, y, z, nx, ny, nz, intfx, intfy, intfz, q1M, q2M, q3M, q4M, q5M, q6M, q7M, q8M, q9M, q10M, q1B, q2B, q3B, q4B, q5B, q6B, q7B, q8B, q9B, q10B) \ +{ \ + if(bc==1){ \ + *(q1B) = q1M; \ + *(q2B) = -q2M; \ + *(q3B) = -q3M; \ + *(q4B) = -q4M; \ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + *(q7B) = q7M; \ + *(q8B) = q8M; \ + *(q9B) = q9M; \ + *(q10B) = q10M; \ + } else if(bc==4||bc==5){ \ + *(q1B) = q1M; \ + *(q2B) = q2M-2.f*(nx*q2M+ny*q3M*nz*q4M)*nx;\ + *(q3B) = q3M-2.f*(nx*q2M+ny*q3M*nz*q4M)*ny;\ + *(q4B) = q4M-2.f*(nx*q2M+ny*q3M*nz*q4M)*nz;\ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + *(q7B) = q7M; \ + *(q8B) = q8M; \ + *(q9B) = q9M; \ + *(q10B) = q10M; \ + }else{ \ + *(q1B) = 2.f*p_q1bar - q1M; \ + *(q2B) = 2.f*p_q1bar*intfx*p_isqrtRT - q2M; \ + *(q3B) = 2.f*p_q1bar*intfy*p_isqrtRT - q3M; \ + *(q4B) = 2.f*p_q1bar*intfz*p_isqrtRT - q4M; \ + *(q5B) = 2.f*p_q1bar*intfx*intfy*p_isqrtRT*p_isqrtRT-q5M; \ + *(q6B) = 2.f*p_q1bar*intfx*intfz*p_isqrtRT*p_isqrtRT-q6M; \ + *(q7B) = 2.f*p_q1bar*intfy*intfz*p_isqrtRT*p_isqrtRT-q7M; \ + *(q8B) = 2.f*p_q1bar*intfx*intfx*p_isqrtRT*p_isqrtRT*p_invsqrt2-q8M; \ + *(q9B) = 2.f*p_q1bar*intfy*intfy*p_isqrtRT*p_isqrtRT*p_invsqrt2-q9M; \ + *(q10B) = 2.f*p_q1bar*intfz*intfz*p_isqrtRT*p_isqrtRT*p_invsqrt2-q10M; \ + } \ +} + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5 */ +#define boundaryConditions3D(bc, t, x, y, z, nx, ny, nz, intfx, intfy, intfz, q1M, q2M, q3M, q4M, q5M, q6M, q7M, q8M, q9M, q10M, q1B, q2B, q3B, q4B, q5B, q6B, q7B, q8B, q9B, q10B) \ +{ \ + if(bc==1){ \ + *(q1B) = q1M; \ + *(q2B) = -q2M; \ + *(q3B) = -q3M; \ + *(q4B) = -q4M; \ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + *(q7B) = q7M; \ + *(q8B) = q8M; \ + *(q9B) = q9M; \ + *(q10B) = q10M; \ + } else if(bc==2 || bc==3){ \ + *(q1B) = 2.f*p_q1bar - q1M; \ + *(q2B) = 2.f*p_q1bar*intfx*p_isqrtRT - q2M; \ + *(q3B) = 2.f*p_q1bar*intfy*p_isqrtRT 
- q3M; \ + *(q4B) = 2.f*p_q1bar*intfz*p_isqrtRT - q4M; \ + *(q5B) = 2.f*p_q1bar*intfx*intfy*p_isqrtRT*p_isqrtRT-q5M; \ + *(q6B) = 2.f*p_q1bar*intfx*intfz*p_isqrtRT*p_isqrtRT-q6M; \ + *(q7B) = 2.f*p_q1bar*intfy*intfz*p_isqrtRT*p_isqrtRT-q7M; \ + *(q8B) = 2.f*p_q1bar*intfx*intfx*p_isqrtRT*p_isqrtRT*p_invsqrt2-q8M; \ + *(q9B) = 2.f*p_q1bar*intfy*intfy*p_isqrtRT*p_isqrtRT*p_invsqrt2-q9M; \ + *(q10B) = 2.f*p_q1bar*intfz*intfz*p_isqrtRT*p_isqrtRT*p_invsqrt2-q10M; \ + } else if(bc==4||bc==5){ \ + *(q1B) = q1M; \ + *(q2B) = q2M-2.f*(nx*q2M+ny*q3M*nz*q4M)*nx;\ + *(q3B) = q3M-2.f*(nx*q2M+ny*q3M*nz*q4M)*ny;\ + *(q4B) = q4M-2.f*(nx*q2M+ny*q3M*nz*q4M)*nz;\ + *(q5B) = q5M; \ + *(q6B) = q6M; \ + *(q7B) = q7M; \ + *(q8B) = q8M; \ + *(q9B) = q9M; \ + *(q10B) = q10M; \ + } \ +} diff --git a/tests/ogs/bns/results.bns b/tests/ogs/bns/results.bns new file mode 100644 index 000000000..1960c60e1 --- /dev/null +++ b/tests/ogs/bns/results.bns @@ -0,0 +1,65 @@ + +BNS+TET+Serial: fails because bnsIsoSurface3D.okl uses occaAtomicAdd + +0 0.996512 1.00399 (time,min(density),max(density) + time = 0.0217286 (99), dt = 0.000300669 accepted (ratio dt/hmin = 0.0345185) 0.0217286 0.9589 1.03881 (time,min(density),max(density) + time = 0.0451057 (199), dt = 0.000185443 accepted (ratio dt/hmin = 0.0212899) 0.0451057 0.947236 1.05515 (time,min(density),max(density) + +BNS+TET+MPI(2) + +==16517== Conditional jump or move depends on uninitialised value(s) +==16517== at 0x50211E3: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== +==16517== Conditional jump or move depends on uninitialised value(s) +==16517== at 0x502116E: occa::memory::removeMHandleRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x50211F5: 
occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== +==16517== Use of uninitialised value of size 8 +==16517== at 0x4F64CB0: occa::withRefs::removeRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x5021178: occa::memory::removeMHandleRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x50211F5: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== +==16517== Invalid read of size 4 +==16517== at 0x4F64CB0: occa::withRefs::removeRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x5021178: occa::memory::removeMHandleRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x50211F5: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== Address 0x40139e47b7fe793d is not stack'd, malloc'd or (recently) free'd +==16517== + +Culprint: + +bns->o_isoGLvalues = (occa::memory *) malloc(bns->isoGNgroups*sizeof(occa::memory)); + for (int gr =0;grisoGNgroups;gr++) + bns->o_isoGLvalues[gr] = mesh->device.malloc(bns->isoGNlevels[gr]*sizeof(dfloat),bns->isoGLvalues[gr]); + + + +[twnvws1:16517] *** Process received signal *** +[twnvws1:16517] Signal: Segmentation fault (11) +[twnvws1:16517] 
Signal code: (128) +[twnvws1:16517] Failing at address: (nil) +Loading cached [put] from [/home/tcew/Work/git/libparanumal.ogs/solvers/bns/../../okl/put.okl] in [08c05df2c4c6fa95/binary] +Loading cached [bnsDotMultiply] from [/home/tcew/Work/git/libparanumal.ogs/solvers/bns/okl/bnsDotMultiply.okl] in [c4f1414560622374/binary] +[twnvws1:16517] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x354b0)[0x62994b0] +[twnvws1:16517] [ 1] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa8withRefs9removeRefEv+0x0)[0x4f64cb0] +[twnvws1:16517] [ 2] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa6memory16removeMHandleRefEv+0x29)[0x5021179] +[twnvws1:16517] [ 3] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa6memory10setMHandleEPNS_8memory_vE+0x16)[0x50211f6] +[twnvws1:16517] [ 4] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa6memoryaSERKS0_+0xc)[0x502325c] +[twnvws1:16517] [ 5] ../../../solvers/bns/bnsMain[0x41bced] +[twnvws1:16517] [ 6] ../../../solvers/bns/bnsMain[0x40ceb2] +[twnvws1:16517] [ 7] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x6284830] +[twnvws1:16517] [ 8] ../../../solvers/bns/bnsMain[0x40edc9] +[twnvws1:16517] *** End of error message *** + diff --git a/tests/ogs/bns/results.text b/tests/ogs/bns/results.text new file mode 100644 index 000000000..1960c60e1 --- /dev/null +++ b/tests/ogs/bns/results.text @@ -0,0 +1,65 @@ + +BNS+TET+Serial: fails because bnsIsoSurface3D.okl uses occaAtomicAdd + +0 0.996512 1.00399 (time,min(density),max(density) + time = 0.0217286 (99), dt = 0.000300669 accepted (ratio dt/hmin = 0.0345185) 0.0217286 0.9589 1.03881 (time,min(density),max(density) + time = 0.0451057 (199), dt = 0.000185443 accepted (ratio dt/hmin = 0.0212899) 0.0451057 0.947236 1.05515 (time,min(density),max(density) + +BNS+TET+MPI(2) + +==16517== Conditional jump or move depends on uninitialised value(s) +==16517== at 0x50211E3: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: 
occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== +==16517== Conditional jump or move depends on uninitialised value(s) +==16517== at 0x502116E: occa::memory::removeMHandleRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x50211F5: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== +==16517== Use of uninitialised value of size 8 +==16517== at 0x4F64CB0: occa::withRefs::removeRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x5021178: occa::memory::removeMHandleRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x50211F5: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== +==16517== Invalid read of size 4 +==16517== at 0x4F64CB0: occa::withRefs::removeRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x5021178: occa::memory::removeMHandleRef() (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x50211F5: occa::memory::setMHandle(occa::memory_v*) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x502325B: occa::memory::operator=(occa::memory const&) (in /home/tcew/Work/git/occa.NC/lib/libocca.so) +==16517== by 0x41BCEC: bnsSetup(mesh_t*, setupAide&) (bnsSetup.c:532) +==16517== by 0x40CEB1: main (bnsMain.c:67) +==16517== Address 0x40139e47b7fe793d is 
not stack'd, malloc'd or (recently) free'd +==16517== + +Culprint: + +bns->o_isoGLvalues = (occa::memory *) malloc(bns->isoGNgroups*sizeof(occa::memory)); + for (int gr =0;grisoGNgroups;gr++) + bns->o_isoGLvalues[gr] = mesh->device.malloc(bns->isoGNlevels[gr]*sizeof(dfloat),bns->isoGLvalues[gr]); + + + +[twnvws1:16517] *** Process received signal *** +[twnvws1:16517] Signal: Segmentation fault (11) +[twnvws1:16517] Signal code: (128) +[twnvws1:16517] Failing at address: (nil) +Loading cached [put] from [/home/tcew/Work/git/libparanumal.ogs/solvers/bns/../../okl/put.okl] in [08c05df2c4c6fa95/binary] +Loading cached [bnsDotMultiply] from [/home/tcew/Work/git/libparanumal.ogs/solvers/bns/okl/bnsDotMultiply.okl] in [c4f1414560622374/binary] +[twnvws1:16517] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x354b0)[0x62994b0] +[twnvws1:16517] [ 1] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa8withRefs9removeRefEv+0x0)[0x4f64cb0] +[twnvws1:16517] [ 2] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa6memory16removeMHandleRefEv+0x29)[0x5021179] +[twnvws1:16517] [ 3] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa6memory10setMHandleEPNS_8memory_vE+0x16)[0x50211f6] +[twnvws1:16517] [ 4] /home/tcew/Work/git/occa.NC/lib/libocca.so(_ZN4occa6memoryaSERKS0_+0xc)[0x502325c] +[twnvws1:16517] [ 5] ../../../solvers/bns/bnsMain[0x41bced] +[twnvws1:16517] [ 6] ../../../solvers/bns/bnsMain[0x40ceb2] +[twnvws1:16517] [ 7] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x6284830] +[twnvws1:16517] [ 8] ../../../solvers/bns/bnsMain[0x40edc9] +[twnvws1:16517] *** End of error message *** + diff --git a/tests/ogs/bns/setupBnsTet3D.rc b/tests/ogs/bns/setupBnsTet3D.rc new file mode 100644 index 000000000..e5cebc7fd --- /dev/null +++ b/tests/ogs/bns/setupBnsTet3D.rc @@ -0,0 +1,159 @@ +[FORMAT] +1.0 + +[DATA FILE] +data/bnsUniform3D.h + +[MESH FILE] +../../../meshes/boltzmannSquareCylinderPML3D.msh + +[MESH DIMENSION] +3 + +[ELEMENT TYPE] # number of edges +6 + +[POLYNOMIAL DEGREE] +3 
+ +[RBAR] # mean density +1.0 + +[UBAR] # mean x-velocity +1.0 + +[VBAR] # mean y-velocity +0.0 + +[WBAR] # mean z-velocity +0.0 + +[BODYFORCE-X] # constant global force on particles in x-direction +0.0 + +[BODYFORCE-Y] # constant global force on particles in y-direction +0.0 + +[BODYFORCE-Z] # constant global force on particles in z-direction +0.0 + +[ABSORBING LAYER] +PML + +[PML PROFILE ORDER] +4 + +[PML SIGMAX MAX] +100 + +[PML SIGMAY MAX] +100 + +[PML SIGMAZ MAX] +100 + +# compute sigma terms default: CUBATURE use COLLOCATION otherwise +[PML INTEGRATION] +COLLOCATION + +[THREAD MODEL] +CUDA + +[PLATFORM NUMBER] +0 + +[DEVICE NUMBER] +0 + +[TIME INTEGRATOR] +#LSERK4 +#MRSAAB +SARK + +[FIXED TIME STEP] +0 + +[VISCOSITY] +1.e-3 + +[SPEED OF SOUND] +5.0 + +[PROBE FLAG] +0 + +[REPORT FLAG] +1 + +[ERROR FLAG] +1 + +[TSTEPS FOR ERROR COMPUTE] +100 + +[TSTEPS FOR SOLUTION OUTPUT] +100 + +[TSTEPS FOR FORCE OUTPUT] +0 + + + +[START TIME] +0 + +[FINAL TIME] +50 + +[OUTPUT INTERVAL] +.1 + +[ABSOLUTE TOLERANCE] +1E-6 + +[RELATIVE TOLERANCE] +1E-5 + +[MINUMUM TIME STEP SIZE] +1E-10 + +[MAX MRAB LEVELS] +100 + +[CFL] +0.2 + +# Currently SARK and LSERK only +[RESTART FROM FILE] +0 + +[WRITE RESTART FILE] +0 + +[RESTART FILE NAME] +bnsRestartTet3D + +[OUTPUT FILE FORMAT] #ISO_WELD - ISO_FULL - VTU +ISO_WELD + +#0 = pr, 1,2,3 = u,v,w 4,5,6 = vortx,vorty,vortz, 7= vort_mag 8= Vel mag +[ISOSURFACE FIELD ID] +7 + +[ISOSURFACE COLOR ID] +8 + +[ISOSURFACE CONTOUR MAX] +2.0 + +[ISOSURFACE CONTOUR MIN] +0.4 + +[ISOSURFACE LEVEL NUMBER] +5 + +[ISOSURFACE GROUP NUMBER] +5 + +[OUTPUT FILE NAME] +fence3D diff --git a/tests/ogs/ins/data/insBeltrami3D.h b/tests/ogs/ins/data/insBeltrami3D.h new file mode 100644 index 000000000..1e0c6ece2 --- /dev/null +++ b/tests/ogs/ins/data/insBeltrami3D.h @@ -0,0 +1,130 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy 
+of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +// Initial conditions +#define insFlowField3D(t,x,y,z, u,v,w,p) \ + { \ + dfloat a = M_PI/4.f; \ + dfloat d = M_PI/2.f; \ + *(u) = -a*(exp(a*x)*sin(a*y+d*z)+exp(a*z)*cos(a*x+d*y))*exp(-d*d*t);\ + *(v) = -a*(exp(a*y)*sin(a*z+d*x)+exp(a*x)*cos(a*y+d*z))*exp(-d*d*t);\ + *(w) = -a*(exp(a*z)*sin(a*x+d*y)+exp(a*y)*cos(a*z+d*x))*exp(-d*d*t);\ + *(p) = -a*a*exp(-2.f*d*d*t)*(exp(2.f*a*x)+exp(2.f*a*y)+exp(2.f*a*z))*(sin(a*x+d*y)*cos(a*z+d*x)*exp(a*(y+z))+sin(a*y+d*z)*cos(a*x+d*y)*exp(a*(x+z))+sin(a*z+d*x)*cos(a*y+d*z)*exp(a*(x+y))); \ + } + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5, z-slip 6 */ +#define insVelocityDirichletConditions3D(bc, t, x, y, z, nx, ny, nz, uM, vM, wM, uB, vB, wB) \ +{ \ + dfloat a = M_PI/4.f; \ + dfloat d = M_PI/2.f; \ + if(bc==1){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + *(wB) = 0.f; \ + } else if(bc==2){ \ + *(uB) = -a*(exp(a*x)*sin(a*y+d*z)+exp(a*z)*cos(a*x+d*y))*exp(-d*d*t);\ + *(vB) = -a*(exp(a*y)*sin(a*z+d*x)+exp(a*x)*cos(a*y+d*z))*exp(-d*d*t);\ + *(wB) = 
-a*(exp(a*z)*sin(a*x+d*y)+exp(a*y)*cos(a*z+d*x))*exp(-d*d*t);\ + } else if(bc==3){ \ + *(uB) = uM; \ + *(vB) = vM; \ + *(wB) = wM; \ + } else if(bc==4||bc==5||bc==6){ \ + *(uB) = uM - (nx*uM+ny*vM+nz*wM)*nx;\ + *(vB) = vM - (nx*uM+ny*vM+nz*wM)*ny;\ + *(wB) = wM - (nx*uM+ny*vM+nz*wM)*nz;\ + } \ +} + +#define insVelocityNeumannConditions3D(bc, t, x, y, z, nx, ny, nz, uxM, uyM, uzM, vxM, vyM, vzM, wxM, wyM, wzM, uxB, uyB, uzB, vxB, vyB, vzB, wxB, wyB, wzB) \ +{ \ + dfloat a = M_PI/4.f; \ + dfloat d = M_PI/2.f; \ + if(bc==1 || bc==2){ \ + *(uxB) = uxM; \ + *(uyB) = uyM; \ + *(uzB) = uzM; \ + *(vxB) = vxM; \ + *(vyB) = vyM; \ + *(vzB) = vzM; \ + *(wxB) = wxM; \ + *(wyB) = wyM; \ + *(wzB) = wzM; \ + } else if(bc==3){ \ + *(uxB) = -a*(a*exp(a*x)*sin(a*y+d*z)-a*exp(a*z)*sin(a*x+d*y))*exp(-d*d*t); \ + *(uyB) = -a*(a*exp(a*x)*cos(a*y+d*z)-d*exp(a*z)*sin(a*x+d*y))*exp(-d*d*t); \ + *(uzB) = -a*(d*exp(a*x)*cos(a*y+d*z)+a*exp(a*z)*cos(a*x+d*y))*exp(-d*d*t); \ + *(vxB) = -a*(d*exp(a*y)*cos(a*z+d*x)+a*exp(a*x)*cos(a*y+d*z))*exp(-d*d*t); \ + *(vyB) = -a*(a*exp(a*y)*sin(a*z+d*x)-a*exp(a*x)*sin(a*y+d*z))*exp(-d*d*t); \ + *(vzB) = -a*(a*exp(a*y)*cos(a*z+d*x)-d*exp(a*x)*sin(a*y+d*z))*exp(-d*d*t); \ + *(wxB) = a*(a*exp(a*z)*cos(a*x+d*y)-d*exp(a*y)*sin(a*z+d*x))*exp(-d*d*t); \ + *(wyB) = a*(d*exp(a*z)*cos(a*x+d*y)+a*exp(a*y)*cos(a*z+d*x))*exp(-d*d*t); \ + *(wzB) = a*(a*exp(a*z)*sin(a*x+d*y)-a*exp(a*y)*sin(a*z+d*x))*exp(-d*d*t); \ + } else if(bc==4||bc==5||bc==6){ \ + *(uxB) = nx*nx*uxM; \ + *(uyB) = nx*nx*uyM; \ + *(uzB) = nx*nx*uzM; \ + *(vxB) = ny*ny*vxM; \ + *(vyB) = ny*ny*vyM; \ + *(vzB) = ny*ny*vzM; \ + *(wxB) = nz*nz*wxM; \ + *(wyB) = nz*nz*wyM; \ + *(wzB) = nz*nz*wzM; \ + } \ +} + + +#define insPressureDirichletConditions3D(bc, t, x, y, z, nx, ny, nz, pM, pB) \ +{ \ + dfloat a = M_PI/4.f; \ + dfloat d = M_PI/2.f; \ + if(bc==1 || bc==2){ \ + *(pB) = pM; \ + } else if(bc==3){ \ + *(pB) = -a*a*exp(-2.f*d*d*t)*( 
exp(2.f*a*x)+exp(2.f*a*y)+exp(2.f*a*z))*(sin(a*x+d*y)*cos(a*z+d*x)*exp(a*(y+z))+sin(a*y+d*z)*cos(a*x+d*y)*exp(a*(x+z))+sin(a*z+d*x)*cos(a*y+d*z)*exp(a*(x+y))); \ + } else if(bc==4||bc==5||bc==6){ \ + *(pB) = pM; \ + } \ +} + +#define insPressureNeumannConditions3D(bc, t, x, y, z, nx, ny, nz, pxM, pyM, pzM, pxB, pyB, pzB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + *(pzB) = 0.f; \ + } else if(bc==3){ \ + *(pxB) = pxM; \ + *(pyB) = pyM; \ + *(pzB) = pzM; \ + } else if(bc==4||bc==5||bc==6){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + *(pzB) = 0.f; \ + } \ +} diff --git a/tests/ogs/ins/data/insKovasznay2D.h b/tests/ogs/ins/data/insKovasznay2D.h new file mode 100644 index 000000000..419384e31 --- /dev/null +++ b/tests/ogs/ins/data/insKovasznay2D.h @@ -0,0 +1,140 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +/* wall 1, inflow 2, outflow 3 */ + +// Weakly Impose Nonlinear term BCs +#define insAdvectionBoundaryConditions2D(bc, t, x, y, nx, ny, uM, vM, uB, vB) \ + { \ + dfloat lambda = 1.f/(2.f * p_nu) - occaSqrt(1.f/(4.f*p_nu*p_nu) + 4.f*M_PI*M_PI);\ + if(bc==1){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + } else if(bc==2){ \ + *(uB) = 1.f - exp(lambda*x)*cos(2.f*M_PI*y); \ + *(vB) = lambda/(2.f*M_PI)*exp(lambda*x)*sin(2.f*M_PI*y); \ + } else if(bc==3){ \ + *(uB) = uM; \ + *(vB) = vM; \ + } \ + } + +#define insDivergenceBoundaryConditions2D(bc, t, x, y, nx, ny, uM, vM, uB, vB) \ + { \ + dfloat lambda = 1.f/(2.f * p_nu) - occaSqrt(1.f/(4.f*p_nu*p_nu) + 4.f*M_PI*M_PI);\ + if(bc==1){ \ + *(uB)= 0.f; \ + *(vB)= 0.f; \ + } else if(bc==2){ \ + *(uB) = 1.f - exp(lambda*x)*cos(2.f*M_PI*y); \ + *(vB) = lambda/(2.f*M_PI)*exp(lambda*x)*sin(2.f*M_PI*y);\ + } else if(bc==3){ \ + *(uB) = uM; \ + *(vB) = vM; \ + } \ + } + +// Gradient only applies to Pressure and Pressure Incremament +// Boundary Conditions are implemented in strong form +#define insGradientBoundaryConditions2D(bc,t,x,y,nx,ny,pM,pB) \ + { \ + dfloat lambda = 1.f/(2.f * p_nu) - occaSqrt(1.f/(4.f*p_nu*p_nu) + 4.f*M_PI*M_PI);\ + if(bc==1){ \ + *(pB) = pM; \ + } else if(bc==2){ \ + *(pB) = pM; \ + } else if(bc==3){ \ + *(pB) = 0.5f*(1.f- exp(2.f*lambda*x));\ + } \ + } + +#define insHelmholtzBoundaryConditionsIpdg2D(bc, t, x, y, nx, ny, uB, uxB, uyB, vB, vxB, vyB) \ + { \ + dfloat lambda = 1.f/(2.f*p_nu)-occaSqrt(1.f/(4.f*p_nu*p_nu) + 4.f*M_PI*M_PI);\ + if((bc==1)||(bc==4)){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + \ + *(uxB) = 0.f; \ + *(uyB) = 0.f; \ + *(vxB) = 0.f; \ + *(vyB) = 0.f; \ + } else if(bc==2){ \ + \ + *(uB) = 1.f - exp(lambda*x)*cos(2.f*M_PI*y); \ + *(vB) = lambda/(2.f*M_PI)*exp(lambda*x)*sin(2.f*M_PI*y); \ + \ + *(uxB) = 0.f; \ + *(uyB) = 0.f; \ + *(vxB) = 0.f; \ + *(vyB) = 0.f; \ + } else if(bc==3){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + *(uxB) = -lambda*exp(lambda*x)*cos(2.f*M_PI*y);\ + *(uyB) = 
2.f*M_PI*exp(lambda*x)*sin(2.f*M_PI*y); \ + *(vxB) = lambda*lambda/(2.f*M_PI)*exp(lambda*x)*sin(2.f*M_PI*y); \ + *(vyB) = lambda*exp(lambda*x)*cos(2.f*M_PI*y); \ + } \ + } + + +// Compute bcs for P increment: if c0 = 0 give Pr BCs, zero if time independent +#define insPoissonBoundaryConditions2D(bc,t,dt,x,y,nx,ny,pB,pxB,pyB) \ + { \ + dfloat lambda = 1.f/(2.f * p_nu) - occaSqrt(1.f/(4.f*p_nu*p_nu) + 4.f*M_PI*M_PI);\ + if((bc==1)||(bc==4)){ \ + *(pB) = 0.f; \ + \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } \ + if(bc==2){ \ + *(pB) = 0.f; \ + \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } \ + if(bc==3){ \ + *(pB) = 0.5f*(1.f- exp(2.f*lambda*x));\ + \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } \ + } + + + +// Compute bcs for P increment +#define insPoissonNeumannTimeDerivative2D(bc,t,x,y,dpdt) \ + { \ + if((bc==1)||(bc==4)||(bc==2) ){ \ + *(dpdt) =0.f; \ + } \ + if(bc==3){ \ + *(dpdt) = 0.0; \ + } \ + } diff --git a/tests/ogs/ins/data/insUniform2D.h b/tests/ogs/ins/data/insUniform2D.h new file mode 100644 index 000000000..e81a57dff --- /dev/null +++ b/tests/ogs/ins/data/insUniform2D.h @@ -0,0 +1,112 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + +// Initial conditions +#define insFlowField2D(t,x,y,u,v,p) \ + { \ + *(u) = p_ubar; \ + *(v) = p_vbar; \ + *(p) = p_pbar; \ + } + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5 */ +#define insVelocityDirichletConditions2D(bc, t, x, y, nx, ny, uM, vM, uB, vB) \ +{ \ + if(bc==1){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + } else if(bc==2){ \ + *(uB) = p_ubar; \ + *(vB) = p_vbar; \ + } else if(bc==3){ \ + *(uB) = uM; \ + *(vB) = vM; \ + } else if(bc==4){ \ + *(uB) = 0.f; \ + *(vB) = vM; \ + } else if(bc==5){ \ + *(uB) = uM; \ + *(vB) = 0.f; \ + } \ +} + +#define insVelocityNeumannConditions2D(bc, t, x, y, nx, ny, uxM, uyM, vxM, vyM, uxB, uyB, vxB, vyB) \ +{ \ + if(bc==1 || bc==2){ \ + *(uxB) = uxM; \ + *(uyB) = uyM; \ + *(vxB) = vxM; \ + *(vyB) = vyM; \ + } else if(bc==3){ \ + *(uxB) = 0.f; \ + *(uyB) = 0.f; \ + *(vxB) = 0.f; \ + *(vyB) = 0.f; \ + } else if(bc==4){ \ + *(uxB) = uxM; \ + *(uyB) = uyM; \ + *(vxB) = 0.f; \ + *(vyB) = 0.f; \ + } else if(bc==5){ \ + *(uxB) = 0.f; \ + *(uyB) = 0.f; \ + *(vxB) = vxM; \ + *(vyB) = vyM; \ + } \ +} + + +#define insPressureDirichletConditions2D(bc, t, x, y, nx, ny, pM, pB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pB) = pM; \ + } else if(bc==3){ \ + *(pB) = p_pbar; \ + } else if(bc==4){ \ + *(pB) = pM; \ + } else if(bc==5){ \ + *(pB) = pM; \ + } \ +} + +#define insPressureNeumannConditions2D(bc, t, x, y, nx, ny, pxM, pyM, pxB, pyB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } else if(bc==3){ \ + *(pxB) = pxM; \ + *(pyB) = pyM; \ + } else if(bc==4){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } else if(bc==5){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } \ +} diff --git a/tests/ogs/ins/data/insUniform3D.h 
b/tests/ogs/ins/data/insUniform3D.h new file mode 100644 index 000000000..c4e61370a --- /dev/null +++ b/tests/ogs/ins/data/insUniform3D.h @@ -0,0 +1,122 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + + +// Initial conditions +#define insFlowField3D(t,x,y,z, u,v,w,p) \ + { \ + *(u) = p_ubar; \ + *(v) = p_vbar; \ + *(w) = p_wbar; \ + *(p) = p_pbar; \ + } + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5, z-slip 6 */ +#define insVelocityDirichletConditions3D(bc, t, x, y, z, nx, ny, nz, uM, vM, wM, uB, vB, wB) \ +{ \ + if(bc==1){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + *(wB) = 0.f; \ + } else if(bc==2){ \ + *(uB) = p_ubar; \ + *(vB) = p_vbar; \ + *(wB) = p_wbar; \ + } else if(bc==3){ \ + *(uB) = uM; \ + *(vB) = vM; \ + *(wB) = wM; \ + } else if(bc==4||bc==5||bc==6){ \ + *(uB) = uM - (nx*uM+ny*vM+nz*wM)*nx;\ + *(vB) = vM - (nx*uM+ny*vM+nz*wM)*ny;\ + *(wB) = wM - (nx*uM+ny*vM+nz*wM)*nz;\ + } \ +} + +#define insVelocityNeumannConditions3D(bc, t, x, y, z, nx, ny, nz, uxM, uyM, uzM, vxM, vyM, vzM, wxM, wyM, wzM, uxB, uyB, uzB, vxB, vyB, vzB, wxB, wyB, wzB) \ +{ \ + if(bc==1 || bc==2){ \ + *(uxB) = uxM; \ + *(uyB) = uyM; \ + *(uzB) = uzM; \ + *(vxB) = vxM; \ + *(vyB) = vyM; \ + *(vzB) = vzM; \ + *(wxB) = wxM; \ + *(wyB) = wyM; \ + *(wzB) = wzM; \ + } else if(bc==3){ \ + *(uxB) = 0.f; \ + *(uyB) = 0.f; \ + *(uzB) = 0.f; \ + *(vxB) = 0.f; \ + *(vyB) = 0.f; \ + *(vzB) = 0.f; \ + *(wxB) = 0.f; \ + *(wyB) = 0.f; \ + *(wzB) = 0.f; \ + } else if(bc==4||bc==5||bc==6){ \ + *(uxB) = nx*nx*uxM; \ + *(uyB) = nx*nx*uyM; \ + *(uzB) = nx*nx*uzM; \ + *(vxB) = ny*ny*vxM; \ + *(vyB) = ny*ny*vyM; \ + *(vzB) = ny*ny*vzM; \ + *(wxB) = nz*nz*wxM; \ + *(wyB) = nz*nz*wyM; \ + *(wzB) = nz*nz*wzM; \ + } \ +} + + +#define insPressureDirichletConditions3D(bc, t, x, y, z, nx, ny, nz, pM, pB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pB) = pM; \ + } else if(bc==3){ \ + *(pB) = p_pbar; \ + } else if(bc==4||bc==5||bc==6){ \ + *(pB) = pM; \ + } \ +} + +#define insPressureNeumannConditions3D(bc, t, x, y, z, nx, ny, nz, pxM, pyM, pzM, pxB, pyB, pzB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + *(pzB) = 0.f; \ + } else if(bc==3){ \ + *(pxB) = pxM; \ + 
*(pyB) = pyM; \ + *(pzB) = pzM; \ + } else if(bc==4||bc==5||bc==6){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + *(pzB) = 0.f; \ + } \ +} diff --git a/tests/ogs/ins/data/insVortex2D.h b/tests/ogs/ins/data/insVortex2D.h new file mode 100644 index 000000000..eb4e0d599 --- /dev/null +++ b/tests/ogs/ins/data/insVortex2D.h @@ -0,0 +1,111 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +// Initial conditions +#define insFlowField2D(t,x,y,u,v,p) \ + { \ + *(u) = -sin(2.f*M_PI*y)*exp(-p_nu*4.f*M_PI*M_PI*t);\ + *(v) = sin(2.f*M_PI*x)*exp(-p_nu*4.f*M_PI*M_PI*t);\ + *(p) = -cos(2.f*M_PI*y)*cos(2.f*M_PI*x)*exp(-p_nu*8.f*M_PI*M_PI*t); \ + } + +// Boundary conditions +/* wall 1, inflow 2, outflow 3, x-slip 4, y-slip 5 */ +#define insVelocityDirichletConditions2D(bc, t, x, y, nx, ny, uM, vM, uB, vB) \ +{ \ + if(bc==1){ \ + *(uB) = 0.f; \ + *(vB) = 0.f; \ + } else if(bc==2){ \ + *(uB) = -sin(2.f*M_PI*y)*exp(-p_nu*4.f*M_PI*M_PI*t);\ + *(vB) = sin(2.f*M_PI*x)*exp(-p_nu*4.f*M_PI*M_PI*t);\ + } else if(bc==3){ \ + *(uB) = uM; \ + *(vB) = vM; \ + } else if(bc==4){ \ + *(uB) = 0.f; \ + *(vB) = vM; \ + } else if(bc==5){ \ + *(uB) = uM; \ + *(vB) = 0.f; \ + } \ +} + +#define insVelocityNeumannConditions2D(bc, t, x, y, nx, ny, uxM, uyM, vxM, vyM, uxB, uyB, vxB, vyB) \ +{ \ + if(bc==1 || bc==2){ \ + *(uxB) = uxM; \ + *(uyB) = uyM; \ + *(vxB) = vxM; \ + *(vyB) = vyM; \ + } else if(bc==3){ \ + *(uxB) = 0.f; \ + *(uyB) = -2.f*M_PI*cos(2.f*M_PI*y)*exp(-p_nu*4.f*M_PI*M_PI*t);\ + *(vxB) = 2.f*M_PI*cos(2.f*M_PI*x)*exp(-p_nu*4.f*M_PI*M_PI*t);\ + *(vyB) = 0.f; \ + } else if(bc==4){ \ + *(uxB) = uxM; \ + *(uyB) = uyM; \ + *(vxB) = 0.f; \ + *(vyB) = 0.f; \ + } else if(bc==5){ \ + *(uxB) = 0.f; \ + *(uyB) = 0.f; \ + *(vxB) = vxM; \ + *(vyB) = vyM; \ + } \ +} + + +#define insPressureDirichletConditions2D(bc, t, x, y, nx, ny, pM, pB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pB) = pM; \ + } else if(bc==3){ \ + *(pB) = -cos(2.f*M_PI*y)*cos(2.f*M_PI*x)*exp(-p_nu*8.f*M_PI*M_PI*t);\ + } else if(bc==4){ \ + *(pB) = pM; \ + } else if(bc==5){ \ + *(pB) = pM; \ + } \ +} + +#define insPressureNeumannConditions2D(bc, t, x, y, nx, ny, pxM, pyM, pxB, pyB) \ +{ \ + if(bc==1 || bc==2){ \ + *(pxB) = 2.f*M_PI*cos(2.f*M_PI*y)*sin(2.f*M_PI*x)*exp(-p_nu*8.f*M_PI*M_PI*t);\ + *(pyB) = 2.f*M_PI*sin(2.f*M_PI*y)*cos(2.f*M_PI*x)*exp(-p_nu*8.f*M_PI*M_PI*t);\ + } else if(bc==3){ \ + *(pxB) = pxM; \ + 
*(pyB) = pyM; \ + } else if(bc==4){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } else if(bc==5){ \ + *(pxB) = 0.f; \ + *(pyB) = 0.f; \ + } \ +} diff --git a/tests/ogs/ins/readme.ins b/tests/ogs/ins/readme.ins new file mode 100644 index 000000000..0c500e258 --- /dev/null +++ b/tests/ogs/ins/readme.ins @@ -0,0 +1,112 @@ +This warning appears when compiling on the pascal compute node: + +/home/tcew/Work/git/occa.NC/scripts/shellTools.sh: line 369: /home/tcew/Work/git/occa.NC/scripts/compilerVendorTest: Text file busy +/home/tcew/Work/git/occa.NC/scripts/shellTools.sh: line 369: /home/tcew/Work/git/occa.NC/scripts/compilerVendorTest: Text file busy + +MPI(1): + +OCCA_VERBOSE=1 ../../../solvers/ins/insMain setupInsTri2D.rc + +Step: 1 Time: 0 minU: 1 maxU: 1 minV: 0 maxV: 0 minP: 0 maxP: 0 +tstep = 100, solver iterations: U - 5, V - 6, P - 5 +Step: 101 Time: 0.899388 minU: 2.98196e-06 maxU: 3.51925 minV: 1.78537e-08 maxV: 1.60021 minP: 0 maxP: 2.09901 + +Writing Binary Restart File....done +tstep = 200, solver iterations: U - 4, V - 6, P - 5 +Step: 201 Time: 1.79878 minU: 7.21785e-05 maxU: 3.38123 minV: 1.51454e-09 maxV: 1.51165 minP: 0 maxP: 1.68119 + +Writing Binary Restart File....done +tstep = 300, solver iterations: U - 4, V - 6, P - 5 +Step: 301 Time: 2.69817 minU: 7.82594e-05 maxU: 3.40896 minV: 4.66123e-10 maxV: 1.46822 minP: 0 maxP: 1.45971 + +Writing Binary Restart File....done +tstep = 400, solver iterations: U - 4, V - 6, P - 5 +Step: 401 Time: 3.59755 minU: 5.54487e-05 maxU: 3.43688 minV: 2.93711e-10 maxV: 1.41623 minP: 0 maxP: 1.49691 + +Writing Binary Restart File....done +tstep = 500, solver iterations: U - 4, V - 6, P - 5 +Step: 501 Time: 4.49694 minU: 1.8506e-05 maxU: 2.68207 minV: 4.62998e-10 maxV: 1.34705 minP: 0 maxP: 1.73217 + +Writing Binary Restart File....done +tstep = 600, solver iterations: U - 4, V - 6, P - 5 +Step: 601 Time: 5.39633 minU: 7.38336e-06 maxU: 3.00247 minV: 2.52416e-09 maxV: 1.3453 minP: 0 maxP: 1.80372 + +Writing Binary Restart File....done
+tstep = 700, solver iterations: U - 4, V - 6, P - 5 +Step: 701 Time: 6.29572 minU: 7.09099e-07 maxU: 3.51038 minV: 1.59509e-10 maxV: 1.31237 minP: 0 maxP: 1.81222 + +Writing Binary Restart File....done +tstep = 800, solver iterations: U - 4, V - 6, P - 5 +Step: 801 Time: 7.19511 minU: 2.47566e-06 maxU: 2.84548 minV: 2.24355e-10 maxV: 1.29519 minP: 0 maxP: 1.89712 + + +MPI(2) + +OCCA_VERBOSE=1 mpiexec.mpich -n 2 ../../../solvers/ins/insMain setupInsTri2D.rc + +Step: 1 Time: 0 minU: 1 maxU: 1 minV: 0 maxV: 0 minP: 0 maxP: 0 +tstep = 100, solver iterations: U - 5, V - 6, P - 5 +Step: 101 Time: 0.899388 minU: 2.98196e-06 maxU: 3.51925 minV: 1.7969e-08 maxV: 1.60021 minP: 0 maxP: 2.09901 + +Writing Binary Restart File....done +tstep = 200, solver iterations: U - 4, V - 6, P - 5 +Step: 201 Time: 1.79878 minU: 7.21785e-05 maxU: 3.38123 minV: 1.51231e-09 maxV: 1.51165 minP: 0 maxP: 1.68119 + +Writing Binary Restart File....done +tstep = 300, solver iterations: U - 4, V - 6, P - 5 +Step: 301 Time: 2.69817 minU: 7.82594e-05 maxU: 3.40896 minV: 5.43209e-11 maxV: 1.46822 minP: 0 maxP: 1.45971 + +Writing Binary Restart File....done +tstep = 400, solver iterations: U - 4, V - 6, P - 5 +Step: 401 Time: 3.59755 minU: 5.54486e-05 maxU: 3.43689 minV: 5.8604e-09 maxV: 1.41623 minP: 0 maxP: 1.49691 + +Writing Binary Restart File....done +tstep = 500, solver iterations: U - 4, V - 6, P - 5 +Step: 501 Time: 4.49694 minU: 1.85059e-05 maxU: 2.68207 minV: 4.63447e-10 maxV: 1.34705 minP: 0 maxP: 1.73217 + +Writing Binary Restart File....done +tstep = 600, solver iterations: U - 4, V - 6, P - 5 +Step: 601 Time: 5.39633 minU: 7.38335e-06 maxU: 3.00247 minV: 1.85397e-09 maxV: 1.3453 minP: 0 maxP: 1.80372 + +Writing Binary Restart File....done +tstep = 700, solver iterations: U - 4, V - 6, P - 6 +Step: 701 Time: 6.29572 minU: 7.09082e-07 maxU: 3.51039 minV: 1.59252e-10 maxV: 1.31237 minP: 0 maxP: 1.81222 + +Writing Binary Restart File....done +tstep = 800, solver iterations: U - 4, V - 6, P - 
5 +Step: 801 Time: 7.19511 minU: 2.47568e-06 maxU: 2.84548 minV: 2.24559e-10 maxV: 1.29519 minP: 0 maxP: 1.89712 + +MPI(4) + +Step: 1 Time: 0 minU: 1 maxU: 1 minV: 0 maxV: 0 minP: 0 maxP: 0 +tstep = 100, solver iterations: U - 5, V - 6, P - 5 +Step: 101 Time: 0.899388 minU: 4.24509e-05 maxU: 3.60427 minV: 8.49773e-09 maxV: 1.60015 minP: 0 maxP: 2.09905 + +Writing Binary Restart File....done +tstep = 200, solver iterations: U - 4, V - 6, P - 5 +Step: 201 Time: 1.79878 minU: 9.12985e-05 maxU: 3.22414 minV: 3.84348e-09 maxV: 1.50416 minP: 0 maxP: 1.67956 + +Writing Binary Restart File....done +tstep = 300, solver iterations: U - 4, V - 6, P - 5 +Step: 301 Time: 2.69817 minU: 4.83735e-05 maxU: 3.23234 minV: 6.62942e-11 maxV: 1.46158 minP: 0 maxP: 1.41877 + +Writing Binary Restart File....done +tstep = 400, solver iterations: U - 4, V - 6, P - 5 +Step: 401 Time: 3.59755 minU: 5.14672e-05 maxU: 3.83142 minV: 4.2182e-09 maxV: 1.40855 minP: 0 maxP: 1.48998 + +Writing Binary Restart File....done +tstep = 500, solver iterations: U - 4, V - 6, P - 5 +Step: 501 Time: 4.49694 minU: 1.97316e-05 maxU: 2.84452 minV: 8.45863e-10 maxV: 1.34087 minP: 0 maxP: 1.7267 + +Writing Binary Restart File....done +tstep = 600, solver iterations: U - 4, V - 6, P - 5 +Step: 601 Time: 5.39633 minU: 1.44322e-05 maxU: 3.02015 minV: 1.46332e-09 maxV: 1.32105 minP: 0 maxP: 1.80392 + +Writing Binary Restart File....done +tstep = 700, solver iterations: U - 4, V - 6, P - 5 +Step: 701 Time: 6.29572 minU: 7.35894e-06 maxU: 3.04242 minV: 6.70193e-10 maxV: 1.30973 minP: 0 maxP: 1.81706 + +Writing Binary Restart File....done +tstep = 800, solver iterations: U - 4, V - 6, P - 5 +Step: 801 Time: 7.19511 minU: 4.61542e-07 maxU: 3.6222 minV: 8.54138e-10 maxV: 1.29296 minP: 0 maxP: 1.91073 + diff --git a/tests/ogs/ins/setupInsTri2D.rc b/tests/ogs/ins/setupInsTri2D.rc new file mode 100644 index 000000000..20ce4c839 --- /dev/null +++ b/tests/ogs/ins/setupInsTri2D.rc @@ -0,0 +1,211 @@ +[FORMAT] +1.0 + +[DATA FILE] 
+data/insUniform2D.h +#data/insVortex2D.h + +[MESH FILE] +../../../meshes/insSquareCylinder2D.msh + +[MESH DIMENSION] +2 + +[ELEMENT TYPE] # number of edges +3 + +[POLYNOMIAL DEGREE] +4 + +[THREAD MODEL] +CUDA + +[PLATFORM NUMBER] +0 + +[DEVICE NUMBER] +0 + +# can be EXTBDF1, EXTBDF2, EXTBDF3, or ARK1 +# can add SUBCYCLING STEPS with EXTBDF +[TIME INTEGRATOR] +#ARK2 +EXTBDF2 + +[SUBCYCLING STEPS] +4 + +# can be CUBATURE or COLLOCATION +[ADVECTION TYPE] +CUBATURE + +[VISCOSITY] +0.001 + +[MACH NUMBER] +.2 + +[UBAR] +1.0 + +[VBAR] +0.0 + +[PBAR] +0.0 + +# zero to ignore +[TSTEPS FOR SOLUTION OUTPUT] +100 + +# zero to ignore +[TSTEPS FOR FORCE OUTPUT] +0 + +# zero to ignore, only ARK stepper +[TSTEPS FOR TIME STEP ADAPT] +100 + +[START TIME] +0 + +[FINAL TIME] +75.0 + +[CFL] +0.5 + +[OUTPUT TYPE] +VTU + +# Only tested with EXTBDF currently +[RESTART FROM FILE] +0 + +[WRITE RESTART FILE] +1 + +[RESTART FILE NAME] +insRestartTri2D + +[OUTPUT FILE NAME] +Tins + +################################################# +########## Velocity Solver Options ############## +################################################# + +# can add FLEXIBLE to PCG +[VELOCITY KRYLOV SOLVER] +PCG + +# can be IPDG, or CONTINUOUS +[VELOCITY DISCRETIZATION] +IPDG + +# can be NODAL or BERN +[VELOCITY BASIS] +NODAL + +# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID +[VELOCITY PRECONDITIONER] +MASSMATRIX + +########## MULTIGRID Options ############## + +# can be ALLDEGREES, HALFDEGREES, HALFDOFS +[VELOCITY MULTIGRID COARSENING] +HALFDOFS + +# can be LOCALPATCH, or DAMPEDJACOBI +# LOCALPATCH smoother can include EXACT +# can include CHEBYSHEV for smoother acceleration +[VELOCITY MULTIGRID SMOOTHER] +DAMPEDJACOBI,CHEBYSHEV + +# can be any integer >0 +[MULTIGRID CHEBYSHEV DEGREE] +2 + +########################################### + +########## ParAlmond Options ############## + +# can be KCYCLE, or VCYCLE +# can add the EXACT and NONSYM options +[VELOCITY PARALMOND CYCLE] +KCYCLE + +# can be DAMPEDJACOBI
or CHEBYSHEV +[VELOCITY PARALMOND SMOOTHER] +CHEBYSHEV + +# can be STRONGNODES, DISTRIBUTED, SATURATE +[VELOCITY PARALMOND PARTITION] +STRONGNODES + +# can be any integer >0 +[PARALMOND CHEBYSHEV DEGREE] +2 + +########################################### + +################################################# +########## Pressure Solver Options ############## +################################################# + +# can add FLEXIBLE to PCG +[PRESSURE KRYLOV SOLVER] +PCG,FLEXIBLE + +# can be IPDG, or CONTINUOUS +[PRESSURE DISCRETIZATION] +CONTINUOUS + +# can be NODAL or BERN +[PRESSURE BASIS] +NODAL + +# can be NONE, JACOBI, MASSMATRIX, FULLALMOND, SEMFEM, or MULTIGRID +[PRESSURE PRECONDITIONER] +MULTIGRID + +########## MULTIGRID Options ############## + +# can be ALLDEGREES, HALFDEGREES, HALFDOFS +[PRESSURE MULTIGRID COARSENING] +HALFDOFS + +# can be LOCALPATCH, or DAMPEDJACOBI +# LOCALPATCH smoother can include EXACT +# can include CHEBYSHEV for smoother acceleration +[PRESSURE MULTIGRID SMOOTHER] +DAMPEDJACOBI,CHEBYSHEV + +########################################### + +########## ParAlmond Options ############## + +# can be KCYCLE, or VCYCLE +# can add the EXACT and NONSYM option +[PRESSURE PARALMOND CYCLE] +KCYCLE + +# can be DAMPEDJACOBI or CHEBYSHEV +[PRESSURE PARALMOND SMOOTHER] +CHEBYSHEV + +# can be STRONGNODES, DISTRIBUTED, SATURATE +[PRESSURE PARALMOND PARTITION] +STRONGNODES + +########################################### + +# compare to a reference solution. Use NONE to skip comparison +# can be VORTEX or KOVASZNAY +[EXACT] +NONE +#VORTEX + +[VERBOSE] +FALSE \ No newline at end of file