diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index dbbf685346532..d332d95d96789 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -45,7 +45,7 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   if (onnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO)
     # "-g3" generates DWARF format debug info.
     # NOTE: With debug info enabled, web assembly artifacts will be very huge (>1GB). So we offer an option to build without debug info.
-    set(CMAKE_CXX_FLAGS_DEBUG "-g3")
+    set(CMAKE_CXX_FLAGS_DEBUG "-g2")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "-g2")
   endif()
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
index b711887690f47..fee4e6d3fa939 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -263,17 +263,22 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
   //std::cout << "Calling f32Multiply\n";
   // should split in parts and call ctx.ParallelFor just on the rows part
+#if 0
   // rowsA = M
   // width = K
   // colsB = N
-#if 0
   size_t rowsA = static_cast<size_t>(helper.M());
   if (rowsA > 1) {
     size_t width = static_cast<size_t>(helper.K());
     size_t colsB = static_cast<size_t>(helper.N());
-    const int8_t* b_data = static_cast<const int8_t*>(b_tensor->DataRaw());
+    //std::cout << "Calling GeckoMatmulIntegerToFloat\n";
+    //int threads = concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool());
+    //std::cout << "degree of parallelism: " << threads << "\n";
+    //std::cout << "batch size: " << num_gemms << "\n";
+
+    GeckoMatmulIntegerToFloat(a_data,
                              a_zp,
@@ -291,7 +296,7 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
 #endif
   MlasGemmBatch(gemm_shape, gemm_data_vec.data(), num_gemms, ctx->GetOperatorThreadPool());
-  //}
+  // }
   //
   /*
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
index 026a1215af42c..c5bbc9f93a9f2 100644
--- a/onnxruntime/core/mlas/lib/qgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -61,7 +61,6 @@ Return Value:
 {
     const ptrdiff_t ThreadIdM = ThreadId / WorkBlock->ThreadCountN;
     const ptrdiff_t ThreadIdN = ThreadId % WorkBlock->ThreadCountN;
-
     //
     // Partition the operation along the M dimension.
     //
@@ -197,16 +196,11 @@ MlasGemmBatch(
         WorkBlock.ThreadCountN = 1;
     }
     TargetThreadCount = ThreadsPerGemm * BatchN;
-    //std::cout << "ThreadsPerGemm: " << ThreadsPerGemm << std::endl;
-    //std::cout << "TargetThreadCount: " << TargetThreadCount << std::endl;
-    //std::cout << "MaximumThreadCount: " << MaximumThreadCount << std::endl;
-
     MlasTrySimpleParallel(ThreadPool, TargetThreadCount, [&](ptrdiff_t tid) {
         const auto gemm_i = tid / ThreadsPerGemm;
         const auto blk_i = tid % ThreadsPerGemm;
-        //std::cout << "gemm_i: " << gemm_i << " blk_i: " << blk_i << std::endl;
         MlasGemmQuantThreaded(&WorkBlock, &Shape, &DataParams[gemm_i], blk_i);
     });
 }
diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
index 2c6d23e4de908..b43d7344d7081 100644
--- a/onnxruntime/core/providers/cpu/math/matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -289,8 +289,12 @@ Status MatMul::Compute(OpKernelContext* ctx) const {
      data[i].alpha = alpha_attr_;
      data[i].beta = 0.0f;
    }
+
+    //auto start = std::chrono::steady_clock::now();
    MlasGemmBatch(trans_a ? CblasTrans : CblasNoTrans, trans_b ? CblasTrans : CblasNoTrans,
                  M, N, K, data.data(), max_len, thread_pool);
+    //auto end = std::chrono::steady_clock::now();
+    //std::cout << "MatMul," << std::chrono::duration_cast(end - start).count() << "," << max_len << std::endl;
  }
  return Status::OK();
}
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 223eed248800e..0bca1b38362fa 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1695,6 +1695,8 @@ common::Status InferenceSession::Initialize() {
   if (session_profiler_.IsEnabled()) {
     tp = session_profiler_.Start();
   }
+  //std::cout << "session Initialize" << std::endl;
+  //auto startInit = std::chrono::steady_clock::now();
   ORT_TRY {
     LOGS(*session_logger_, INFO) << "Initializing session.";
@@ -1720,6 +1722,9 @@ common::Status InferenceSession::Initialize() {
   }
   // Verify that there are no external initializers in the graph if external data is disabled.
+  //std::cout << "session Initialize loading main graph" << std::endl;
+
+
   onnxruntime::Graph& graph = model_->MainGraph();
 #ifdef DISABLE_EXTERNAL_INITIALIZERS
   const InitializedTensorSet& initializers = graph.GetAllInitializedTensors();
@@ -1767,6 +1772,8 @@ common::Status InferenceSession::Initialize() {
     TraceLoggingWriteStart(session_activity, "OrtInferenceSessionActivity");
     session_activity_started_ = true;
 #endif
+    //std::cout << "session Initialize - creating state" << std::endl;
+
     // now that we have all the execution providers, create the session state
     session_state_ = std::make_unique<SessionState>(
@@ -1824,6 +1831,10 @@ common::Status InferenceSession::Initialize() {
     }();
     if (!loading_ort_format) {
+      //std::cout << "session Initialize not using ort" << std::endl;
+
+
+
 #if !defined(ORT_MINIMAL_BUILD)
       const auto minimal_build_opt_config_value = session_options_.config_options.GetConfigOrDefault(
           kOrtSessionOptionsConfigMinimalBuildOptimizations, "");
@@ -1845,6 +1856,10 @@ common::Status InferenceSession::Initialize() {
                                                         *session_logger_));
 #ifdef USE_DML
+      // std::cout << "session Initialize using DML" << std::endl;
+
+
+
       const IExecutionProvider* dmlExecutionProvider = execution_providers_.Get(kDmlExecutionProvider);
       if (dmlExecutionProvider) {
@@ -1900,10 +1915,16 @@ common::Status InferenceSession::Initialize() {
 #endif
       // apply any transformations to the main graph and any subgraphs
+      //auto start = std::chrono::steady_clock::now();
       ORT_RETURN_IF_ERROR_SESSIONID_(TransformGraph(graph, saving_ort_format));
+      //auto end = std::chrono::steady_clock::now();
+      //std::cout << "Graph transformations took " << std::chrono::duration_cast(end - start).count() << " ms" << std::endl;
       // now that all the transforms are done, call Resolve on the main graph. this will recurse into the subgraphs.
+      //start = std::chrono::steady_clock::now();
       ORT_RETURN_IF_ERROR_SESSIONID_(graph.Resolve());
+      //end = std::chrono::steady_clock::now();
+      //std::cout << "Graph resolution took " << std::chrono::duration_cast(end - start).count() << " ms" << std::endl;
       // Currently graph capture is only considered by CUDA EP, TRT EP, ROCM EP and JS EP.
       //
@@ -2052,6 +2073,9 @@ common::Status InferenceSession::Initialize() {
                                         "Loading anything other than ORT format models is not enabled in this build."));
 #endif  // !defined(ORT_MINIMAL_BUILD)
     } else {
+      //std::cout << "session Initialize - loading ort" << std::endl;
+
+
       ORT_RETURN_IF_ERROR_SESSIONID_(PartitionOrtFormatModel(graph, execution_providers_, kernel_registry_manager_,
                                                              *session_state_, session_options_.config_options,
                                                              *session_logger_));
@@ -2171,6 +2195,8 @@ common::Status InferenceSession::Initialize() {
     }
   }
+  //auto endInitialization = std::chrono::steady_clock::now();
+  //std::cout << "session Initialize - Initialization time: " << std::chrono::duration_cast(endInitialization - startInit).count() << " ms" << std::endl;
   return status;
 }
 #if defined(_MSC_VER) && !defined(__clang__)
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index 983321593a92b..89fd6aac943c2 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -15,6 +15,8 @@
 */
 // Modifications Copyright (c) Microsoft.
+#include
+#include
 #include "core/util/math_cpuonly.h"
 #include "core/util/math.h"
 #include "core/framework/float16.h"