diff --git a/cleanup b/cleanup
deleted file mode 100755
index b9cc889..0000000
--- a/cleanup
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env sh
-rm -f src/Makevars
diff --git a/scdemon/__init__.py b/scdemon/__init__.py
index 3a6b793..3ff2783 100644
--- a/scdemon/__init__.py
+++ b/scdemon/__init__.py
@@ -1,4 +1,4 @@
-from . import _core as core
+#from . import _core as core
 from .robust_se import robust_se_default, robust_prepare
 
 from .utils import (
diff --git a/scdemon/py_se.cpp b/scdemon/py_se.cpp
index 1fa708d..cb47c0c 100644
--- a/scdemon/py_se.cpp
+++ b/scdemon/py_se.cpp
@@ -4,51 +4,19 @@
 
 namespace py = pybind11;
 
-
-// todo fix by using .cast() in se.hpp
-//template
-Eigen::ArrayXd py_robust_se_X(const py::EigenDRef<Eigen::MatrixXd> &X,
-                              const py::EigenDRef<Eigen::MatrixXd> &Y,
-                              double lambda=1e-10)
-{
-    return robust_se_X(X, Y, lambda, 1e-300);
-}
-
-Eigen::SparseMatrix<double> py_robust_se(const py::EigenDRef<Eigen::MatrixXd> &X,
-                                         const py::EigenDRef<Eigen::MatrixXd> &Y,
-                                         double lambda=1e-10,
-                                         double t_cutoff=6.5,
-                                         bool abs_t=false)
-{
-    return robust_se(X, Y, lambda, 1e-300, t_cutoff, abs_t);
-}
-
-Eigen::MatrixXd py_ols_beta(const py::EigenDRef<Eigen::MatrixXd> &X,
-                            const py::EigenDRef<Eigen::MatrixXd> &Y,
-                            double lambda
-                            )
-{
-    return ols_beta(X, Y, lambda);
-}
-
-Eigen::MatrixXd py_ols_resid(const py::EigenDRef<Eigen::MatrixXd> &X,
-                             const py::EigenDRef<Eigen::MatrixXd> &Y,
-                             const py::EigenDRef<Eigen::MatrixXd> &beta)
+template <typename TU, typename TV>
+Eigen::SparseMatrix<float> py_robust_se(const py::EigenDRef<Eigen::Matrix<TU, Eigen::Dynamic, Eigen::Dynamic> > &U,
+                                        const py::EigenDRef<Eigen::Matrix<TV, Eigen::Dynamic, Eigen::Dynamic> > &V,
+                                        float lambda,
+                                        float t_cutoff,
+                                        bool abs_cutoff)
 {
-    return ols_resid(X, Y, beta);
+    return intra_robust_se(U, V, lambda, t_cutoff, abs_cutoff);
 }
 
 PYBIND11_MODULE(_core, m) {
-    m.def("py_ols_beta", &py_ols_beta);
-    m.def("py_ols_resid", &py_ols_resid);
-    m.def("py_robust_se_X", &py_robust_se_X);
-    m.def("py_robust_se", &py_robust_se);
-    // );
-    // m.def("robust_se_X_ddf", &py_robust_se_X);
-    // m.def("robust_se_X_dfd", &py_robust_se_X);
-    // m.def("robust_se_X_dff", &py_robust_se_X);
-    // m.def("robust_se_X_fdd", &py_robust_se_X);
-    // m.def("robust_se_X_fdf", &py_robust_se_X);
-    // m.def("robust_se_X_ffd", &py_robust_se_X);
-    // m.def("robust_se_X_fff", &py_robust_se_X);
+    m.def("py_robust_se_ff", &py_robust_se<float, float>);
+    m.def("py_robust_se_df", &py_robust_se<double, float>);
+    m.def("py_robust_se_fd", &py_robust_se<float, double>);
+    m.def("py_robust_se_dd", &py_robust_se<double, double>);
 }
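The four bindings registered above differ only in their scalar template arguments; the suffix encodes the (U dtype, V dtype) pair, with f = float32 and d = float64, which is what the dispatcher in scdemon/robust_se.py below relies on. A minimal sketch of calling one instantiation directly, assuming the extension builds as scdemon._core and that arrays are passed column-major (Fortran order) as pybind11's Eigen::Ref bindings typically require; the shapes are placeholders:

import numpy as np
from scdemon import _core as core

U = np.asfortranarray(np.random.randn(500, 20).astype(np.float32))  # float32 U
V = np.asfortranarray(np.random.randn(20, 100))                     # float64 V
# "_fd" = float32 U, float64 V; arguments are (U, V, lambda, t_cutoff, abs_cutoff)
S = core.py_robust_se_fd(U, V, 100.0, 6.5, False)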
diff --git a/scdemon/robust_se.py b/scdemon/robust_se.py
index 3b6a0b2..93ddf08 100644
--- a/scdemon/robust_se.py
+++ b/scdemon/robust_se.py
@@ -1,22 +1,23 @@
 import numpy as np
 import scipy.sparse
-from . import _core as core
-from .utils import ProgressManager, _interrupt_checker
-def _robust_se_X(i:int, V:np.ndarray, lamb:float=1e-10) -> np.ndarray:
-    return core.py_robust_se_X(V[:, i].astype("f8"), V.astype("f8"), lamb)
-
-def _robust_se(X:np.ndarray, V:np.ndarray, lamb:float, t_cutoff:float, abs_t:bool) -> scipy.sparse.csc_matrix:
-    return core.py_robust_se(X.astype("f8"), V.astype('f8'), lamb, t_cutoff, abs_t)
-
-def ols_resid(X:np.ndarray, Y:np.ndarray, beta:np.ndarray) -> np.ndarray:
-    return core.py_ols_resid(X.astype("f8"), Y.astype("f8"), beta.astype("f8"))
-
-def ols_beta(X:np.ndarray, Y:np.ndarray) -> np.ndarray:
-    return core.py_ols_beta(X.astype("f8"), Y.astype("f8"))
+#from .utils import ProgressManager, _interrupt_checker
+
+def _robust_se(U:np.ndarray, V:np.ndarray, lamb:float=100, t_cutoff:float=6.5, abs_t:bool=False) -> scipy.sparse.csc_matrix:
+    from . import _core as core
+    if U.dtype == np.float32 and V.dtype == np.float32:
+        return core.py_robust_se_ff(U, V, lamb, t_cutoff, abs_t)
+    elif U.dtype == np.float64 and V.dtype == np.float64:
+        return core.py_robust_se_dd(U, V, lamb, t_cutoff, abs_t)
+    elif U.dtype == np.float32 and V.dtype == np.float64:
+        return core.py_robust_se_fd(U, V, lamb, t_cutoff, abs_t)
+    elif U.dtype == np.float64 and V.dtype == np.float32:
+        return core.py_robust_se_df(U, V, lamb, t_cutoff, abs_t)
+    else:
+        raise NotImplementedError("Not implemented for numpy dtype")
 
-def robust_prepare(U:np.ndarray, V:np.ndarray, B=None, n_components:int=None, min_norm:float=1e-5, return_U=False):
+def robust_prepare(U:np.ndarray, V:np.ndarray, B=None, n_components:int=None, min_norm:float=1e-5, return_U=True, power:float=0):
     if U.shape[1] != V.shape[0]:
         raise ValueError("Shapes must match")
     if n_components != None:
@@ -24,24 +25,23 @@ def robust_prepare(U:np.ndarray, V:np.ndarray, B=None, n_components:int=None, mi
             raise ValueError("N components must be > 0")
         U = U[:, :n_components]
         V = V[:n_components, :]
+    bad_flag = np.linalg.norm(V, axis=0, ord=2) < min_norm
     if B is None:
-        B = np.ones((U.shape[0], 1))
+        return U, V[:, ~bad_flag], ~bad_flag
     if U.shape[0] != B.shape[0]:
         raise ValueError("U and B are not the correct shape")
     V1 = np.nan * np.ones_like(V)
-    bad_flag = np.linalg.norm(V, axis=0, ord=2) < min_norm
     V_svd = np.linalg.svd(V[:, ~bad_flag], full_matrices=False)
     U = U @ V_svd.U
-    qr = np.linalg.qr(ols_resid(X=B, Y=U, beta=ols_beta(X=B, Y=U)))
+    qr = np.linalg.qr(U - B @ (np.linalg.pinv(B) @ U))
     RS_svd = np.linalg.svd(qr.R @ np.diag(V_svd.S))
-    V1[:, ~bad_flag] = np.diag(RS_svd.S) @ RS_svd.Vh @ V_svd.Vh
-    dof = B.shape[0] - B.shape[1]
+    V1[:, ~bad_flag] = np.diag(RS_svd.S**power) @ RS_svd.Vh @ V_svd.Vh
     if return_U:
-        return V1, dof, qr.Q @ RS_svd.U
+        return qr.Q @ RS_svd.U @ np.diag(RS_svd.S**(1-power)), V1, ~bad_flag
     else:
-        return V1, dof, None
+        return None, V1, ~bad_flag
 
-def robust_se_default(U, V, B=None, t_cutoff:float=None, abs_t:bool=False, nominal_p_cutoff:float=0.05) -> scipy.sparse.csc_matrix:
+def robust_se_default(U, V, B=None, t_cutoff:float=None, abs_t:bool=False, lamb:float=100., nominal_p_cutoff:float=0.05, n_components:int=None, min_norm:float=1e-5) -> scipy.sparse.csc_matrix:
     """
     U: U from SVD
     V: V\Sigma from SVD
@@ -49,16 +49,15 @@ def robust_se_default(U, V, B=None, t_cutoff:float=None, abs_t:bool=False, nomin
     """
     if U.shape[1] != V.shape[0]:
         raise ValueError("U and V must have compatible dimensions : %d != %d" % (U.shape[1], V.shape[0]))
-    if B is None:
-        B = np.ones((U.shape[0], 1), dtype="f8")
-    elif U.shape[0] != B.shape[0]:
-        raise ValueError("U and B must have compatible dimensions : %d != %d" % (U.shape[0], B.shape[0]))
-    UpB = core.py_ols_beta(U.astype("f8"), B.astype("f8"))
-    UpU = core.py_ols_beta(U.astype("f8"), U.astype("f8"))
+    U, V, var_flag = robust_prepare(U, V, B=B, n_components=n_components, min_norm=min_norm, return_U=True)
     if t_cutoff is None:
        import scipy.stats
        t_cutoff = scipy.stats.t.isf(nominal_p_cutoff * V.shape[1]**-2,
-                                     B.shape[0] - B.shape[1])
-    return _robust_se(V, V, t_cutoff=t_cutoff, abs_t=abs_t)
+                                     max(U.shape[0] - V.shape[1] - 2, 1))
+    S = _robust_se(U, V, t_cutoff=t_cutoff, abs_t=abs_t, lamb=lamb)
+    I = np.ravel(np.where(var_flag))
+    trans = scipy.sparse.csr_matrix((np.ones_like(I), (I, np.arange(len(I)))),
+                                    shape=(len(var_flag), S.shape[0]))
+    return trans.dot(S).dot(trans.T)
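For context, a hypothetical end-to-end call of the refactored Python entry point above (the data, shapes and SVD preprocessing are assumptions for illustration; robust_se_default, lamb and t_cutoff are the actual parameters from this diff):

import numpy as np
import scdemon

X = np.random.randn(1000, 200)                            # e.g. cells x genes, already centered
svd = np.linalg.svd(X, full_matrices=False)
k = 50
U = np.asfortranarray(svd.U[:, :k])                        # "U from SVD"
V = np.asfortranarray(np.diag(svd.S[:k]) @ svd.Vh[:k])     # "V\Sigma from SVD" (Sigma @ Vh)
# Sparse matrix of pairwise t-statistics exceeding t_cutoff; both arrays are
# float64 here, so _robust_se dispatches to _core.py_robust_se_dd.
S = scdemon.robust_se_default(U, V, t_cutoff=6.5, lamb=100.0)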
diff --git a/src/implprogress.hpp b/src/implprogress.hpp
index c8f6523..bd6ce90 100644
--- a/src/implprogress.hpp
+++ b/src/implprogress.hpp
@@ -9,6 +9,9 @@
 #include
 #endif
 
+#ifndef BAR_WIDTH
+#define BAR_WIDTH 60
+#endif
 class ImplProgress
 {
 private:
@@ -16,11 +19,11 @@ class ImplProgress
     std::unique_ptr<Progress> p;
 #else
     unsigned long total, current, last;
-    bool aborted;
+    bool aborted;
 #endif
 public:
 #if defined(Rcpp_hpp)
-    ImplProgress(unsigned long count) : p(std::make_unique<Progress>(count, true)) {}
+    ImplProgress(unsigned long count) : p(std::make_unique<Progress>(count, true)) {}
     ~ImplProgress() = default;
 #else
     ImplProgress(unsigned long count) : total(count), current(0), last(0), aborted(false) {}
@@ -48,10 +51,17 @@ class ImplProgress
        double cur_percent = current / total;
        if (cur_percent >= 0.01 + last_percent) {
            int pct = 100 * cur_percent;
-           std::cout << pct << "%" << std::endl;
+           int pos = cur_percent * BAR_WIDTH;
+           std::string out(BAR_WIDTH, ' ');
+           for (int i = 0; i < BAR_WIDTH; i++) {
+               if (i < pos) { out[i] = '='; }
+               else if (i == pos) { out[i] = '>'; }
+           }
+           std::cout << "[" << out << "] " << pct << "%\r";
+           std::cout.flush();
            last = current;
        }
-#endif
+#endif
     }
     void increment(unsigned long count=1) {
 #if defined(Rcpp_hpp)
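Illustration only (not part of the patch): the bar drawn by the loop above fills BAR_WIDTH cells with '=', marks the current position with '>', and redraws in place via '\r'. Roughly equivalent Python, with render_bar as a hypothetical name:

def render_bar(fraction, width=60):
    pos = int(fraction * width)
    cells = ["=" if i < pos else (">" if i == pos else " ") for i in range(width)]
    return "[" + "".join(cells) + "] %d%%" % int(100 * fraction)

print(render_bar(0.42), end="\r")  # overwrite the same terminal line, like '\r' + flush() above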
diff --git a/src/se.hpp b/src/se.hpp
index c65a592..1ea8ee8 100644
--- a/src/se.hpp
+++ b/src/se.hpp
@@ -4,137 +4,91 @@
 #include
 #include
-//#if defined(_USE_GSL)
-// #include
-// #include
-//#endif
 #include
 #include
 #include
 #include
+#include <limits>
 #include "implprogress.hpp"
 
-/*
- * Reminder: Y has many columns, X should have 1.
- */
-template <typename TX, typename TY>
-Eigen::MatrixXd ols_beta(const Eigen::MatrixBase<TX> &X, const Eigen::MatrixBase<TY> &Y, double lambda)
-{
-    if (lambda > 0) {
-        Eigen::MatrixXd I = Eigen::MatrixXd::Identity(X.cols(), X.cols());
-        Eigen::MatrixXd A = (X.transpose() * X + lambda * I).eval();
-        Eigen::JacobiSVD<Eigen::MatrixXd> svd(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
-        return svd.solve(X.transpose() * Y);
-    } else {
-        return X.completeOrthogonalDecomposition().solve(Y).eval();
-    }
-}
-template <typename TX, typename TY, typename TB>
-Eigen::MatrixXd ols_resid(const Eigen::MatrixBase<TX> &X,
-                          const Eigen::MatrixBase<TY> &Y,
-                          const Eigen::MatrixBase<TB> &beta) {
-    return (Y - X * beta).eval();
-}
-
-/*
- * Y must be U1^+ U1)
- */
-template <typename TX, typename TY>
-Eigen::ArrayXd robust_se_X(const Eigen::MatrixBase<TX> &Xmat,
-                           const Eigen::MatrixBase<TY> &Y,
-                           double lambda=1e-10,
-                           double epsilon=1e-300)
+template <typename TV>
+Eigen::Vector<typename TV::Scalar, Eigen::Dynamic>
+intra_ols_beta(const Eigen::MatrixBase<TV> &V,
+               const Eigen::DiagonalMatrix<typename TV::Scalar, Eigen::Dynamic> &tuuituu,
+               Eigen::Index i)
 {
-    Eigen::VectorXd X = Xmat.col(0).eval();
-    double X_sqnorm = std::max(X.squaredNorm(), epsilon);
-    // X pseudoinverse is X' / squared_norm
-    Eigen::VectorXd beta = (X.transpose() * Y).eval() / (X_sqnorm + lambda);
-    Eigen::VectorXd var = ((X.transpose() / X_sqnorm).cwiseAbs2() * (Y - X * beta.transpose()).cwiseAbs2()).eval();
-    Eigen::ArrayXd tval = beta.array();
-    return tval * var.cwiseMax(epsilon).array().rsqrt();
+    Eigen::Vector<typename TV::Scalar, Eigen::Dynamic> beta = (V.transpose() * tuuituu * V.col(i));
+    beta[i] = 0.; // Set to zero to make equivalent to subsetting V, but keeping dim.
+    return beta;
 }
 
-/* Each column of X versus Y */
-template <typename TX, typename TY>
-Eigen::ArrayXd robust_se_Y(const Eigen::MatrixBase<TX> &X,
-                           const Eigen::MatrixBase<TY> &Ymat,
-                           double lambda=1e-10,
-                           double epsilon=1e-300)
+template <typename TU, typename TV>
+Eigen::Vector<typename TU::Scalar, Eigen::Dynamic>
+intra_ols_resid(const Eigen::MatrixBase<TU> &U,
+                const Eigen::MatrixBase<TV> &V,
+                const Eigen::Vector<typename TV::Scalar, Eigen::Dynamic> &beta,
+                Eigen::Index i)
 {
-
-    Eigen::VectorXd Y = Ymat.col(0);
-    Eigen::ArrayXd X_sqnorm = X.colwise().squaredNorm().cwiseMax(epsilon).array();
-    // X pseudoinverse is X' / squared_norm
-    Eigen::VectorXd beta = ((X.transpose() * Y).array() / (X_sqnorm + lambda)).matrix().eval();
-    Eigen::MatrixXd res2 = X * beta.asDiagonal();
-    res2.colwise() -= Y;
-    res2 = res2.cwiseAbs2();
-    //Eigen::MatrixXd res2 = (Y - (X * beta.asDiagonal()).colwise()).cwiseAbs2().eval();
-    Eigen::MatrixXd pinv2 = (X * (1/X_sqnorm).matrix().asDiagonal()).cwiseAbs2().eval();
-    Eigen::VectorXd var = (pinv2.array() * res2.array()).colwise().sum();
-    //Eigen::VectorXd var = ((X * (1/X_sqnorm).matrix().asDiagonal()).transpose().cwiseAbs2() * (Y - X * beta.asDiagonal()).cwiseAbs2()).eval();
-    Eigen::ArrayXd tval = beta.array();
-    return tval * var.cwiseMax(epsilon).array().rsqrt();
+    Eigen::Vector<typename TU::Scalar, Eigen::Dynamic> res_pc = (V.col(i) - V * beta).eval().template cast<typename TU::Scalar>();
+    return U * res_pc;
 }
 
-template <typename TY, typename TL>
-Eigen::VectorXd ols_beta_L(Eigen::VectorXd X, double X_sqnorm,
-                           const Eigen::MatrixBase<TY> &Y,
-                           const Eigen::ArrayBase<TL> &lambda)
+template <typename TU>
+Eigen::MatrixXf intra_ols_norm_mat(const Eigen::DiagonalMatrix<float, Eigen::Dynamic> &tuui,
+                                   const Eigen::MatrixBase<TU> &U,
+                                   const Eigen::Vector<typename TU::Scalar, Eigen::Dynamic> &resid)
 {
-    Eigen::ArrayXd numer = (X.transpose() * Y).eval().array();
-    Eigen::ArrayXd denom = (lambda + X_sqnorm).eval();
-    Eigen::ArrayXd quot = numer / denom;
-    return quot.matrix();
+    // Using float for QR for caching/speed + space
+    Eigen::Index n = U.cols();
+    Eigen::MatrixXf pre_qr = (resid.asDiagonal() * U).template cast<float>().eval();
+    // Use in-place QR to speed up
+    Eigen::HouseholderQR<Eigen::Ref<Eigen::MatrixXf> > qr(pre_qr);
+    Eigen::MatrixXf R = qr.matrixQR().topLeftCorner(n, n).triangularView<Eigen::Upper>();
+    return R * tuui;
 }
 
-// template
-// Eigen::MatrixXd robust_se_full(const Eigen::MatrixBase &X,
-//                                const Eigen::MatrixBase &Ymat,
-//                                double lambda=1e-10)
-// {
-
-// }
-template <typename TX, typename TY, typename TL>
-Eigen::ArrayXd robust_se_L(const Eigen::MatrixBase<TX> &Xmat,
                          const Eigen::MatrixBase<TY> &Y, /* V\Sigma?? */
-                           const Eigen::ArrayBase<TL> &lambda,
-                           double epsilon=1e-300)
+template <typename TV>
+Eigen::Array<typename TV::Scalar, Eigen::Dynamic, 1> intra_ols_se(const Eigen::MatrixBase<TV> &V,
+                                                                  const Eigen::MatrixXf &shared,
+                                                                  float epsilon=std::numeric_limits<float>::epsilon())
 {
-    Eigen::VectorXd X = Xmat.col(0).eval();
-    double X_sqnorm = std::max(X.squaredNorm(), epsilon);
-    Eigen::VectorXd beta = ols_beta_L(X, X_sqnorm, Y, lambda);
-    // X pseudoinverse is X' / squared_norm
-    Eigen::VectorXd var = ((X.transpose() / X_sqnorm).cwiseAbs2() * (Y - X * beta.transpose()).cwiseAbs2()).eval();
-    Eigen::ArrayXd tval = beta.array();
-    return tval * var.cwiseMax(epsilon).array().rsqrt();
+    // shared should be same dimension of R from qr(diag(resid) * U)
+    Eigen::Matrix<typename TV::Scalar, Eigen::Dynamic, Eigen::Dynamic> shared_casted = shared.template cast<typename TV::Scalar>();
+    Eigen::Vector<typename TV::Scalar, Eigen::Dynamic> se = (shared_casted * V).colwise().norm().eval();
    return se.cwiseMax(epsilon).array();
 }
 
-template <typename TX, typename TY>
-Eigen::SparseMatrix<double> robust_se(const Eigen::MatrixBase<TX> &X,
-                                      const Eigen::MatrixBase<TY> &Y,
-                                      double lambda=1e-10,
-                                      double epsilon=1e-300,
-                                      double t_cutoff=6.5,
-                                      bool abs_cutoff=false)
+template <typename TU, typename TV>
+Eigen::SparseMatrix<float> intra_robust_se(const Eigen::MatrixBase<TU> &U,
+                                           const Eigen::MatrixBase<TV> &V,
+                                           float lambda=100,
+                                           float t_cutoff=6.5,
+                                           bool abs_cutoff=false)
 {
-    Eigen::SparseMatrix<double> M(Y.cols(), X.cols());
-    ImplProgress p(X.cols());
+    const Eigen::VectorXf sigma_2 = U.colwise().squaredNorm().template cast<float>().eval();
+    const Eigen::ArrayXf sigma_2_inv = (1. / (sigma_2.array() + lambda*lambda));
+    const Eigen::DiagonalMatrix<float, Eigen::Dynamic> tuui = sigma_2_inv.matrix().asDiagonal();
+    const Eigen::DiagonalMatrix<typename TV::Scalar, Eigen::Dynamic> tuuituu = (sigma_2.array() / (sigma_2.array() + lambda*lambda)).matrix().template cast<typename TV::Scalar>().asDiagonal();
+    Eigen::SparseMatrix<float> M(V.cols(), V.cols());
+    ImplProgress p(V.cols());
 #if defined(_OPENMP)
 #pragma omp parallel
 #endif
     {
-        Eigen::SparseMatrix<double> local_mat(Y.cols(), X.cols());
+        Eigen::SparseMatrix<float> local_mat(V.cols(), V.cols());
 #if defined(_OPENMP)
 #pragma omp for nowait
 #endif
-        for (int i = 0; i < X.cols(); i++) {
+        for (Eigen::Index i = 0; i < V.cols(); i++) {
             if (!p.check_abort()) {
-                p.increment();
-                Eigen::ArrayXd tv = robust_se_X(X.col(i), Y, lambda, epsilon);
+                p.increment();
+                Eigen::Vector<typename TV::Scalar, Eigen::Dynamic> beta = intra_ols_beta(V, tuuituu, i);
+                Eigen::MatrixXf shared = intra_ols_norm_mat(tuui, U, intra_ols_resid(U, V, beta, i));
+                Eigen::Array<typename TV::Scalar, Eigen::Dynamic, 1> se = intra_ols_se(V, shared);
+                Eigen::Array<typename TV::Scalar, Eigen::Dynamic, 1> tv = beta.array() / se;
                 for (int j = 0; j < tv.size(); j++) {
                     if (abs_cutoff && (t_cutoff <= -tv[j])) {
-                        local_mat.insert(j, i) = -tv[j];
+                        local_mat.insert(j, i) = tv[j];
                     } else if (tv[j] >= t_cutoff) {
                         local_mat.insert(j, i) = tv[j];
                     }
@@ -150,7 +104,7 @@ Eigen::SparseMatrix<double> robust_se(const Eigen::MatrixBase<TX> &X,
         M.makeCompressed();
         return M;
     } else {
-        Eigen::SparseMatrix<double> Mbad(Y.cols(), X.cols());
+        Eigen::SparseMatrix<float> Mbad(V.cols(), V.cols());
         return Mbad;
     }
 }
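To summarize the refactor in src/se.hpp: for each column i of V, intra_ols_beta computes ridge coefficients against all other columns in the PC basis, intra_ols_resid maps the residual back to observation space, and intra_ols_norm_mat / intra_ols_se turn the QR factor of diag(resid) * U into sandwich-style standard errors, yielding the t-statistics that intra_robust_se thresholds into a sparse matrix. A minimal NumPy sketch of that per-column computation (illustration only; robust_t_column is a hypothetical name, and it assumes U has orthogonal columns, i.e. U'U is diagonal, as when U comes from an SVD):

import numpy as np

def robust_t_column(U, V, i, lamb=100.0, eps=np.finfo(np.float32).eps):
    sigma2 = (U ** 2).sum(axis=0)                  # squared column norms of U (diagonal of U'U)
    tuui = 1.0 / (sigma2 + lamb * lamb)            # diagonal of (U'U + lamb^2 I)^-1
    beta = V.T @ (sigma2 * tuui * V[:, i])         # ridge coefficients in the PC basis
    beta[i] = 0.0                                  # drop the self-regression term
    resid = U @ (V[:, i] - V @ beta)               # residual in observation space
    R = np.linalg.qr(resid[:, None] * U, mode="r") # R factor of diag(resid) @ U
    se = np.linalg.norm((R * tuui) @ V, axis=0)    # sandwich-style robust standard errors
    return beta / np.maximum(se, eps)              # t-statistic of every column against column i

The C++ computes the same vector, keeps only entries with tv >= t_cutoff (or |tv| >= t_cutoff when abs_cutoff is set), and stores them in column i of the sparse result.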