Skip to content

Commit

Permalink
tl updates
Browse files Browse the repository at this point in the history
  • Loading branch information
gliptic committed Aug 4, 2016
1 parent 7954259 commit f7b8c3a
Show file tree
Hide file tree
Showing 47 changed files with 3,636 additions and 1,077 deletions.
6 changes: 6 additions & 0 deletions _build/Minimal size.props
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,19 @@
<SmallerTypeCheck>false</SmallerTypeCheck>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<BufferSecurityCheck>false</BufferSecurityCheck>
<ControlFlowGuard>false</ControlFlowGuard>
<ExceptionHandling>false</ExceptionHandling>
<FunctionLevelLinking>true</FunctionLevelLinking>
<DisableSpecificWarnings>4577</DisableSpecificWarnings>
</ClCompile>
<Link>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<AdditionalOptions>/MERGE:.rdata=.text %(AdditionalOptions)</AdditionalOptions>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
<IgnoreAllDefaultLibraries>true</IgnoreAllDefaultLibraries>
</Link>
</ItemDefinitionGroup>
<ItemGroup />
Expand Down
4 changes: 4 additions & 0 deletions _build/Tl Standard Config.props
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
<PreprocessorDefinitions>_CRT_NONSTDC_NO_WARNINGS;_SECURE_SCL=0;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ObjectFileName>$(IntDir)\%(Directory)</ObjectFileName>
<FloatingPointModel>Precise</FloatingPointModel>
<WarningLevel>Level4</WarningLevel>
<ExceptionHandling>false</ExceptionHandling>
<RuntimeTypeInfo>false</RuntimeTypeInfo>
<BufferSecurityCheck>false</BufferSecurityCheck>
</ClCompile>
</ItemDefinitionGroup>
<ItemGroup />
Expand Down
Binary file modified _build/msvcrt6.lib
Binary file not shown.
Binary file modified _build/msvcrt64.lib
Binary file not shown.
547 changes: 74 additions & 473 deletions _build/tl.vcxproj

Large diffs are not rendered by default.

35 changes: 33 additions & 2 deletions _build/tl.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,18 @@
<ClCompile Include="..\windows\runtime_vcpp.cpp">
<Filter>windows</Filter>
</ClCompile>
<ClCompile Include="..\image.cpp" />
<ClCompile Include="..\windows\stream.cpp">
<Filter>windows</Filter>
</ClCompile>
<ClCompile Include="..\stream.cpp" />
<ClCompile Include="..\approxmath\sincos.cpp">
<Filter>approxmath</Filter>
</ClCompile>
<ClCompile Include="..\string_set.cpp" />
<ClCompile Include="..\vec.cpp" />
<ClCompile Include="..\string.cpp" />
<ClCompile Include="..\bits.c" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\inflate.h" />
Expand Down Expand Up @@ -288,11 +300,25 @@
<ClInclude Include="..\utf8.h" />
<ClInclude Include="..\strscan.h" />
<ClInclude Include="..\char.h" />
<ClInclude Include="..\vec.hpp" />
<ClInclude Include="..\vector.hpp" />
<ClInclude Include="..\stream.h" />
<ClInclude Include="..\stream.hpp" />
<ClInclude Include="..\vector.h" />
<ClInclude Include="..\image.hpp" />
<ClInclude Include="..\windows\win.hpp">
<Filter>windows</Filter>
</ClInclude>
<ClInclude Include="..\rect.hpp" />
<ClInclude Include="..\string.hpp" />
<ClInclude Include="..\approxmath\am.hpp">
<Filter>approxmath</Filter>
</ClInclude>
<ClInclude Include="..\rand.hpp" />
<ClInclude Include="..\string_set.hpp" />
<ClInclude Include="..\shared_ptr.hpp" />
<ClInclude Include="..\vector_old.hpp" />
<ClInclude Include="..\vec.hpp" />
<ClInclude Include="..\vector.hpp" />
<ClInclude Include="..\filesystem.hpp" />
</ItemGroup>
<ItemGroup>
<Filter Include="fdlibm">
Expand All @@ -313,4 +339,9 @@
<Filter>codec</Filter>
</None>
</ItemGroup>
<ItemGroup>
<MASM Include="..\windows\allmul.asm">
<Filter>windows</Filter>
</MASM>
</ItemGroup>
</Project>
16 changes: 16 additions & 0 deletions approxmath/am.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef TL_APPROXMATH_HPP
#define TL_APPROXMATH_HPP 1

#include "../vector.hpp"

namespace tl {

// Circle constants in double precision: pi, and pi2 (twice pi).
static const f64 pi  = 3.1415926535897932384626433832795;
static const f64 pi2 = 6.283185307179586476925286766559;

// Declared here, defined in another translation unit.
// NOTE(review): presumably returns the sine and cosine of x packed in a
// VectorD2 -- confirm the component order against the definition.
VectorD2 sincos(double x);

} // namespace tl

#endif // TL_APPROXMATH_HPP

259 changes: 184 additions & 75 deletions approxmath/sincos.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,187 @@
#include "../bits.h"
#include <stdint.h>


/*
 * Approximate sine of x, computed with scalar SSE intrinsics.
 *
 * What the code does, step by step:
 *   1. Multiplies x by the constant at _ps_am_2_o_pi and ANDs the product
 *      with _ps_am_inv_sign_mask (presumably 2/pi and an "all bits except
 *      the sign" mask -- confirm against the constant tables).
 *   2. Splits that value into a truncated integer part and a fraction; the
 *      fraction is clamped so it never exceeds the constant at _ps_am_1.
 *   3. Blends (1 - fraction) and fraction bitwise, using the mask
 *      _sincos_masks[integer & 1], to get the polynomial argument t.
 *   4. Evaluates ((p3*t^2 + p2)*t^2 + p1)*t^2 + p0 and multiplies it by t.
 *   5. The sign of the result is the sign bit of x XOR bit 1 of the integer
 *      part; it is OR-ed into t before the final multiply.
 *
 * Only the low lane of each __m128 value is meaningful.
 */
float am_sinf(float x) {
	uint32_t sign_bits, whole, flip;
	__m128 scaled, one, frac, mirrored, select, t, t2, poly, result;

	/* Sign bit of the input, kept as an integer bit pattern. */
	sign_bits = tl_ftourep(x);
	sign_bits &= 0x80000000;

	scaled = _mm_mul_ss(_mm_load_ss(&x), _mm_load_ss(_ps_am_2_o_pi));
	scaled = _mm_and_ps(scaled, _mm_load_ss((float const*)_ps_am_inv_sign_mask));

	/* Integer part; its bit 1 is moved into the sign position. */
	whole = (uint32_t)_mm_cvttss_si32(scaled);
	flip = (whole << (31 - 1)) & 0x80000000;

	/*
	 * Fractional part. The first operand of _mm_cvtsi32_ss only supplies the
	 * upper three lanes; the previous code passed a variable that had never
	 * been written there (uninitialized read, MSVC C4700). A zero vector
	 * gives the same low-lane result without touching indeterminate data.
	 */
	one = _mm_load_ss(_ps_am_1);
	frac = _mm_sub_ss(scaled, _mm_cvtsi32_ss(_mm_setzero_ps(), (int)whole));
	frac = _mm_min_ss(frac, one);

	/* t = (mirrored & select) | (frac & ~select) */
	select = _mm_load_ss((float*)&_sincos_masks[whole & 1]);
	mirrored = _mm_sub_ss(one, frac);
	t = _mm_or_ps(_mm_and_ps(mirrored, select), _mm_andnot_ps(select, frac));

	/* Horner evaluation in t^2 with coefficients p3..p0. */
	t2 = _mm_mul_ss(t, t);
	poly = _mm_mul_ss(t2, _mm_load_ss(_ps_sincos_p3));
	poly = _mm_add_ss(poly, _mm_load_ss(_ps_sincos_p2));
	poly = _mm_mul_ss(poly, t2);
	poly = _mm_add_ss(poly, _mm_load_ss(_ps_sincos_p1));
	poly = _mm_mul_ss(poly, t2);
	poly = _mm_add_ss(poly, _mm_load_ss(_ps_sincos_p0));

	/* Apply the combined sign to t, then scale by the polynomial. */
	sign_bits ^= flip;
	result = _mm_or_ps(t, _mm_load_ss((float const*)&sign_bits));
	result = _mm_mul_ss(result, poly);

	_mm_store_ss(&x, result);

	return x;
}

#if 0 // NOT Finished
/*
 * Work-in-progress port of am_sincosf from MSVC inline assembly to SSE
 * intrinsics. The enclosing "#if 0" keeps it out of the build.
 *
 * The intrinsic statements at the top of the body mirror the asm
 * instructions up to and including the second "shl" (a ~ eax, d ~ edx,
 * c ~ ecx, s ~ esi). Everything after that exists only as assembly.
 *
 * NOTE(review): the asm block still repeats those already-ported steps and
 * never loads x into the xmm0 register itself; this must be resolved before
 * the function can be enabled.
 *
 * Two results are produced and stored into r.first (from xmm0) and r.second
 * (from xmm4) -- presumably sine and cosine respectively; confirm against
 * the enabled am_sincosf implementation.
 */
am_pair am_sincosf(float x)
{
am_pair r;
/* Scratch memory used to move integer register values into xmm registers. */
float temp1, temp2;

uint32_t a, c, d, s;
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

/* --- Part already expressed with intrinsics --- */
xmm0 = _mm_load_ss(&x);
xmm1 = _mm_load_ss((float const*)_ps_am_inv_sign_mask);
a = tl_ftourep(x);
xmm0 = _mm_mul_ss(xmm0, _mm_load_ss(_ps_am_2_o_pi));
xmm0 = _mm_and_ps(xmm0, xmm1);
a &= 0x80000000;

/* d = truncated integer part; c and s are copies, d is then incremented. */
d = _mm_cvttss_si32(xmm0);
c = d;
s = d;
++d;
c <<= (31 - 1);
d <<= (31 - 1);

/* --- Original assembly; lines marked "ok" have an intrinsic twin above --- */
__asm
{
movss xmm1, _ps_am_inv_sign_mask // ok
mov eax, x // ok
mulss xmm0, _ps_am_2_o_pi // ok
andps xmm0, xmm1 // ok
and eax, 0x80000000 // ok

cvttss2si edx, xmm0 //
mov ecx, edx //
mov esi, edx //
add edx, 0x1 //
shl ecx, (31 - 1) //
shl edx, (31 - 1) //

movss xmm4, _ps_am_1
cvtsi2ss xmm3, esi
mov temp1, eax
and esi, 0x1

subss xmm0, xmm3
movss xmm3, _sincos_inv_masks[esi * 4]
minss xmm0, xmm4

subss xmm4, xmm0

movss xmm6, xmm4
andps xmm4, xmm3
and ecx, 0x80000000
movss xmm2, xmm3
andnps xmm3, xmm0
and edx, 0x80000000
movss xmm7, temp1
andps xmm0, xmm2
mov temp1, ecx
mov temp2, edx
orps xmm4, xmm3

andnps xmm2, xmm6
orps xmm0, xmm2

// From here on xmm0 and xmm4 carry the two polynomial arguments; both are
// run through the same p3..p0 coefficients, instruction by instruction.
movss xmm2, temp1
movss xmm1, xmm0
movss xmm5, xmm4
xorps xmm7, xmm2
movss xmm3, _ps_sincos_p3
mulss xmm0, xmm0
mulss xmm4, xmm4
movss xmm2, xmm0
movss xmm6, xmm4
orps xmm1, xmm7
movss xmm7, _ps_sincos_p2
mulss xmm0, xmm3
mulss xmm4, xmm3
movss xmm3, _ps_sincos_p1
addss xmm0, xmm7
addss xmm4, xmm7
movss xmm7, _ps_sincos_p0
mulss xmm0, xmm2
mulss xmm4, xmm6
addss xmm0, xmm3
addss xmm4, xmm3
movss xmm3, temp2
mulss xmm0, xmm2
mulss xmm4, xmm6
orps xmm5, xmm3
addss xmm0, xmm7
addss xmm4, xmm7
mulss xmm0, xmm1
mulss xmm4, xmm5

// Write both results into the returned pair.
movss r.first, xmm0
movss r.second, xmm4

//ret 16 + 4 + 4 + 8
//ret
}
return r;
}

#endif

/*
float am_sinf(float x)
{
float r;
_mm_store_ss(&r, am_sin_ss(_mm_load_ss(&x)));
return r;
}*/

float am_sinf_2(float x)
{
float r;
_mm_store_ss(&r, am_sin_ess(_mm_load_ss(&x)));
return r;
}

float am_cosf(float x)
{
float r;
_mm_store_ss(&r, am_cos_ss(_mm_load_ss(&x)));
return r;
}

#if TL_X86

am_pair am_sincosf(float x)
{
am_pair r;
Expand Down Expand Up @@ -141,80 +322,6 @@ float am_sinf_inline(float x)
return x;
}

/*
 * Approximate sine of x using scalar SSE intrinsics (a line-by-line
 * transcription of an assembly routine, hence the register-style names).
 *
 * Outline: scale x by the constant at _ps_am_2_o_pi and mask it with
 * _ps_am_inv_sign_mask, split the result into integer part and fraction,
 * blend fraction / (1 - fraction) with _sincos_masks[integer & 1], evaluate
 * a polynomial in the square of that value with coefficients p3..p0, and
 * apply a sign built from the sign bit of x XOR bit 1 of the integer part.
 * Only the low lane of each vector is meaningful.
 */
float am_sinf(float x)
{
	uint32_t a, c, d;
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
	xmm0 = _mm_load_ss(&x);
	/* The table is only read, so keep the pointer const-qualified. */
	xmm1 = _mm_load_ss((float const*)_ps_am_inv_sign_mask);
	a = tl_ftourep(x);
	xmm0 = _mm_mul_ss(xmm0, _mm_load_ss(_ps_am_2_o_pi));
	xmm0 = _mm_and_ps(xmm0, xmm1);
	a &= 0x80000000; /* a = sign bit of x */

	c = _mm_cvttss_si32(xmm0); /* integer part */
	xmm1 = _mm_load_ss(_ps_am_1);
	d = c;
	d <<= (31 - 1); /* bit 1 of the integer part -> sign position */
	/*
	 * The first operand only provides the upper three lanes. xmm2 had not
	 * been written at this point, so passing it was an uninitialized read
	 * (MSVC C4700); a zero vector yields the same low-lane value.
	 */
	xmm2 = _mm_cvtsi32_ss(_mm_setzero_ps(), c);
	c &= 1;
	d &= 0x80000000;

	xmm0 = _mm_sub_ss(xmm0, xmm2); /* fraction */
	xmm6 = _mm_load_ss((float*)&_sincos_masks[c]);
	xmm0 = _mm_min_ss(xmm0, xmm1);

	xmm5 = _mm_load_ss(_ps_sincos_p3);
	xmm1 = _mm_sub_ss(xmm1, xmm0); /* 1 - fraction */

	/* xmm1 = (xmm1 & mask) | (fraction & ~mask) */
	xmm1 = _mm_and_ps(xmm1, xmm6);
	xmm6 = _mm_andnot_ps(xmm6, xmm0);
	xmm1 = _mm_or_ps(xmm1, xmm6);
	xmm4 = _mm_load_ss(_ps_sincos_p2);
	xmm0 = xmm1;

	/* Horner evaluation in the squared argument (kept in xmm2). */
	xmm1 = _mm_mul_ss(xmm1, xmm1);
	xmm7 = _mm_load_ss(_ps_sincos_p1);
	a ^= d; /* final sign bit */
	xmm2 = xmm1;
	xmm1 = _mm_mul_ss(xmm1, xmm5);
	xmm5 = _mm_load_ss(_ps_sincos_p0);
	xmm1 = _mm_add_ss(xmm1, xmm4);
	xmm1 = _mm_mul_ss(xmm1, xmm2);
	xmm3 = _mm_load_ss((float const*)&a);
	xmm1 = _mm_add_ss(xmm1, xmm7);
	xmm1 = _mm_mul_ss(xmm1, xmm2);
	xmm0 = _mm_or_ps(xmm0, xmm3); /* OR the sign into the argument */
	xmm1 = _mm_add_ss(xmm1, xmm5);
	xmm0 = _mm_mul_ss(xmm0, xmm1);

	_mm_store_ss(&x, xmm0);

	return x;
}

/*
float am_sinf(float x)
{
float r;
_mm_store_ss(&r, am_sin_ss(_mm_load_ss(&x)));
return r;
}*/

/*
 * Convenience wrapper: feeds x (as the low lane of an SSE value) to
 * am_sin_ess and hands back the low lane of what it returns.
 */
float am_sinf_2(float x)
{
	float out;
	__m128 const packed = _mm_load_ss(&x);

	_mm_store_ss(&out, am_sin_ess(packed));
	return out;
}

/*
 * Convenience wrapper: feeds x (as the low lane of an SSE value) to
 * am_cos_ss and hands back the low lane of what it returns.
 */
float am_cosf(float x)
{
	float out;
	__m128 const packed = _mm_load_ss(&x);

	_mm_store_ss(&out, am_cos_ss(packed));
	return out;
}

__m128 __declspec(naked) __cdecl am_sin_ss(__m128 x)
{
__asm
Expand Down Expand Up @@ -367,4 +474,6 @@ __m128 __declspec(naked) __cdecl am_sin_ess(__m128 x)

ret
}
}
}

#endif
Loading

0 comments on commit f7b8c3a

Please sign in to comment.