From f829a12add380c78e50ecc86357d4a1b513f21f0 Mon Sep 17 00:00:00 2001
From: boyealkim <gangmul12@snu.ac.kr>
Date: Thu, 25 Jul 2019 20:42:13 +0900
Subject: [PATCH 1/4] fix stream_merge front algo

---
 src/stream_manager.cc | 50 +++++++++++++++++++++++++++----------------
 src/stream_manager.h  | 12 +++++++++--
 2 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/src/stream_manager.cc b/src/stream_manager.cc
index 6cd62a228..6e02dcdb4 100644
--- a/src/stream_manager.cc
+++ b/src/stream_manager.cc
@@ -194,14 +194,17 @@ bool stream_operation::do_operation( gpgpu_sim *gpu )
         m_stream->record_next_done();
         } 
         break;
-    case stream_wait_event: {
+    case stream_wait_event:
         //only allows next op to go if event is done
         //otherwise stays in the stream queue
         printf("stream wait event processing...\n");
-        if(m_event->done())
+		  if(m_event->done()){
             printf("stream wait event done\n");
             m_stream->record_next_done();
         }
+		  else{
+			  return false;
+		  }
         break;
     default:
         abort();
@@ -232,6 +235,7 @@ stream_manager::stream_manager( gpgpu_sim *gpu, bool cuda_launch_blocking )
     m_service_stream_zero = false;
     m_cuda_launch_blocking = cuda_launch_blocking;
     pthread_mutex_init(&m_lock,NULL);
+	 m_last_stream = m_streams.begin();
 }
 
 bool stream_manager::operation( bool * sim)
@@ -330,22 +334,31 @@ stream_operation stream_manager::front()
             m_service_stream_zero = false;
         }
     }
-    
     if(!m_service_stream_zero)
     {
-        std::list<struct CUstream_st*>::iterator s;
-        for( s=m_streams.begin(); s != m_streams.end(); s++) {
-            CUstream_st *stream = *s;
-            if( !stream->busy() && !stream->empty() ) {
-                result = stream->next();
-                if( result.is_kernel() ) {
-                    unsigned grid_id = result.get_kernel()->get_uid();
-                    m_grid_id_to_stream[grid_id] = stream;
-                }
-                break;
-            }
-        }
-    }
+		 std::list<struct CUstream_st*>::iterator s = m_last_stream;
+		 if(m_last_stream == m_streams.end()){
+			 s = m_streams.begin();
+		 }
+		 else{
+			 s++;
+		 }
+       for(size_t ii = 0 ; ii < m_streams.size(); ii++, s++) {
+			 if(s == m_streams.end()){
+				 s = m_streams.begin();
+			 }
+			 m_last_stream = s;
+          CUstream_st *stream = *s;
+          if( !stream->busy() && !stream->empty() ) {
+              result = stream->next();
+              if( result.is_kernel() ) {
+                  unsigned grid_id = result.get_kernel()->get_uid();
+                  m_grid_id_to_stream[grid_id] = stream;
+              }
+              break;
+          }
+		 }
+	 }
     return result;
 }
 
@@ -370,7 +383,8 @@ void stream_manager::destroy_stream( CUstream_st *stream )
             break;
         }
     }
-    delete stream; 
+    delete stream;
+	 m_last_stream = m_streams.begin();
     pthread_mutex_unlock(&m_lock);
 }
 
@@ -384,7 +398,6 @@ bool stream_manager::concurrent_streams_empty()
     for( s=m_streams.begin(); s!=m_streams.end();++s ) {
         struct CUstream_st *stream = *s;
         if( !stream->empty() ) {
-            //stream->print(stdout);
             result = false;
             break;
         }
@@ -437,7 +450,6 @@ void stream_manager::print_impl( FILE *fp)
 void stream_manager::push( stream_operation op )
 {
     struct CUstream_st *stream = op.get_stream();
-
     // block if stream 0 (or concurrency disabled) and pending concurrent operations exist
     bool block= !stream || m_cuda_launch_blocking;
     while(block) {
diff --git a/src/stream_manager.h b/src/stream_manager.h
index 91d1b362d..ae1af9fa3 100644
--- a/src/stream_manager.h
+++ b/src/stream_manager.h
@@ -163,7 +163,6 @@ class stream_operation {
     void print( FILE *fp ) const;
     struct CUstream_st *get_stream() { return m_stream; }
     void set_stream( CUstream_st *stream ) { m_stream = stream; }
-
 private:
     struct CUstream_st *m_stream;
 
@@ -179,9 +178,9 @@ class stream_operation {
     const char *m_symbol;
     size_t m_offset;
 
+    struct CUevent_st *m_event;
     bool m_sim_mode;
     kernel_info_t *m_kernel;
-    struct CUevent_st *m_event;
 };
 
 struct CUevent_st {
@@ -193,6 +192,7 @@ struct CUevent_st {
       m_updates = 0;
       m_wallclock = 0;
       m_gpu_tot_sim_cycle = 0;
+		m_issued = 0;
       m_done = false;
    }
    void update( double cycle, time_t clk )
@@ -207,11 +207,18 @@ struct CUevent_st {
    unsigned num_updates() const { return m_updates; }
    bool done() const { return m_done; }
    time_t clock() const { return m_wallclock; }
+	void issue(){
+		m_issued++;
+	}
+	unsigned int num_issued() const{
+		return m_issued;
+	}
 private:
    int m_uid;
    bool m_blocking;
    bool m_done;
    int m_updates;
+	unsigned int m_issued;
    time_t m_wallclock;
    double m_gpu_tot_sim_cycle;
 
@@ -268,6 +275,7 @@ class stream_manager {
     CUstream_st m_stream_zero;
     bool m_service_stream_zero;
     pthread_mutex_t m_lock;
+	 std::list<struct CUstream_st*>::iterator m_last_stream;
 };
 
 #endif

From 8e9f0375a6a8bed217b0634a08f7f1f0cd149744 Mon Sep 17 00:00:00 2001
From: boyealkim <gangmul12@snu.ac.kr>
Date: Thu, 25 Jul 2019 20:31:21 +0900
Subject: [PATCH 2/4] fix stream fix bug and fix cudaStreamWaitEvent impl

---
 libcuda/cuda_runtime_api.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc
index 44f0f4ea4..7005eef58 100644
--- a/libcuda/cuda_runtime_api.cc
+++ b/libcuda/cuda_runtime_api.cc
@@ -1696,6 +1696,7 @@ __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t s
 	if( !e ) return g_last_cudaError = cudaErrorUnknown;
 	struct CUstream_st *s = (struct CUstream_st *)stream;
 	stream_operation op(e,s);
+	e->issue();
 	g_stream_manager->push(op);
 	return g_last_cudaError = cudaSuccess;
 }
@@ -1708,9 +1709,13 @@ __host__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEven
    //reference: https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__STREAM_gfe68d207dc965685d92d3f03d77b0876.html 
 	CUevent_st *e = get_event(event);
 	if( !e ){
-	   printf("GPGPU-Sim API: Warning: cudaEventRecord has not been called on event before calling cudaStreamWaitEvent.\nNothing to be done.\n");
-      return g_last_cudaError = cudaSuccess;
+	   printf("GPGPU-Sim API: Error at cudaStreamWaitEvent. Event is not created .\n");
+		return g_last_cudaError = cudaErrorInvalidResourceHandle;
    }
+	else if(e->num_issued() == 0){
+		printf("GPGPU-Sim API: Warning: cudaEventRecord has not been called on event before calling cudaStreamWaitEvent.\nNothing to be done.\n");
+      return g_last_cudaError = cudaSuccess;
+	}
    if (!stream){
       g_stream_manager->pushCudaStreamWaitEventToAllStreams(e, flags);
    } else {

From b13557baa4ea6d1084bdcc9b28b9eb660c394427 Mon Sep 17 00:00:00 2001
From: boyealkim <gangmul12@snu.ac.kr>
Date: Fri, 26 Jul 2019 11:55:45 +0900
Subject: [PATCH 3/4] fix indentation

---
 libcuda/cuda_runtime_api.cc | 10 +++----
 src/stream_manager.cc       | 58 ++++++++++++++++++-------------------
 src/stream_manager.h        | 17 +++++------
 3 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc
index 7005eef58..6a7826ee0 100644
--- a/libcuda/cuda_runtime_api.cc
+++ b/libcuda/cuda_runtime_api.cc
@@ -1709,13 +1709,13 @@ __host__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEven
    //reference: https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__STREAM_gfe68d207dc965685d92d3f03d77b0876.html 
 	CUevent_st *e = get_event(event);
 	if( !e ){
-	   printf("GPGPU-Sim API: Error at cudaStreamWaitEvent. Event is not created .\n");
-		return g_last_cudaError = cudaErrorInvalidResourceHandle;
+      printf("GPGPU-Sim API: Error at cudaStreamWaitEvent. Event is not created .\n");
+      return g_last_cudaError = cudaErrorInvalidResourceHandle;
    }
-	else if(e->num_issued() == 0){
-		printf("GPGPU-Sim API: Warning: cudaEventRecord has not been called on event before calling cudaStreamWaitEvent.\nNothing to be done.\n");
+   else if(e->num_issued() == 0){
+      printf("GPGPU-Sim API: Warning: cudaEventRecord has not been called on event before calling cudaStreamWaitEvent.\nNothing to be done.\n");
       return g_last_cudaError = cudaSuccess;
-	}
+   }
    if (!stream){
       g_stream_manager->pushCudaStreamWaitEventToAllStreams(e, flags);
    } else {
diff --git a/src/stream_manager.cc b/src/stream_manager.cc
index 6e02dcdb4..d0eecddba 100644
--- a/src/stream_manager.cc
+++ b/src/stream_manager.cc
@@ -198,13 +198,13 @@ bool stream_operation::do_operation( gpgpu_sim *gpu )
         //only allows next op to go if event is done
         //otherwise stays in the stream queue
         printf("stream wait event processing...\n");
-		  if(m_event->done()){
+        if(m_event->done()){
             printf("stream wait event done\n");
             m_stream->record_next_done();
         }
-		  else{
-			  return false;
-		  }
+        else{
+            return false;
+        }
         break;
     default:
         abort();
@@ -235,7 +235,7 @@ stream_manager::stream_manager( gpgpu_sim *gpu, bool cuda_launch_blocking )
     m_service_stream_zero = false;
     m_cuda_launch_blocking = cuda_launch_blocking;
     pthread_mutex_init(&m_lock,NULL);
-	 m_last_stream = m_streams.begin();
+    m_last_stream = m_streams.begin();
 }
 
 bool stream_manager::operation( bool * sim)
@@ -336,29 +336,25 @@ stream_operation stream_manager::front()
     }
     if(!m_service_stream_zero)
     {
-		 std::list<struct CUstream_st*>::iterator s = m_last_stream;
-		 if(m_last_stream == m_streams.end()){
-			 s = m_streams.begin();
-		 }
-		 else{
-			 s++;
-		 }
-       for(size_t ii = 0 ; ii < m_streams.size(); ii++, s++) {
-			 if(s == m_streams.end()){
-				 s = m_streams.begin();
-			 }
-			 m_last_stream = s;
-          CUstream_st *stream = *s;
-          if( !stream->busy() && !stream->empty() ) {
-              result = stream->next();
-              if( result.is_kernel() ) {
-                  unsigned grid_id = result.get_kernel()->get_uid();
-                  m_grid_id_to_stream[grid_id] = stream;
-              }
-              break;
-          }
-		 }
-	 }
+        std::list<struct CUstream_st*>::iterator s = m_last_stream;
+        if(m_last_stream == m_streams.end()){ s = m_streams.begin(); }
+        else{ s++; }
+        for(size_t ii = 0 ; ii < m_streams.size(); ii++, s++) {
+            if(s == m_streams.end()){
+                s = m_streams.begin();
+            }
+            m_last_stream = s;
+            CUstream_st *stream = *s;
+            if( !stream->busy() && !stream->empty() ) {
+                result = stream->next();
+                if( result.is_kernel() ) {
+                    unsigned grid_id = result.get_kernel()->get_uid();
+                    m_grid_id_to_stream[grid_id] = stream;
+                }
+                break;
+            }
+        }
+    }
     return result;
 }
 
@@ -383,8 +379,8 @@ void stream_manager::destroy_stream( CUstream_st *stream )
             break;
         }
     }
-    delete stream;
-	 m_last_stream = m_streams.begin();
+    delete stream; 
+    m_last_stream = m_streams.begin();
     pthread_mutex_unlock(&m_lock);
 }
 
@@ -398,6 +394,7 @@ bool stream_manager::concurrent_streams_empty()
     for( s=m_streams.begin(); s!=m_streams.end();++s ) {
         struct CUstream_st *stream = *s;
         if( !stream->empty() ) {
+            //stream->print(stdout);
             result = false;
             break;
         }
@@ -450,6 +447,7 @@ void stream_manager::print_impl( FILE *fp)
 void stream_manager::push( stream_operation op )
 {
     struct CUstream_st *stream = op.get_stream();
+
     // block if stream 0 (or concurrency disabled) and pending concurrent operations exist
     bool block= !stream || m_cuda_launch_blocking;
     while(block) {
diff --git a/src/stream_manager.h b/src/stream_manager.h
index ae1af9fa3..eb4c6ff06 100644
--- a/src/stream_manager.h
+++ b/src/stream_manager.h
@@ -163,6 +163,7 @@ class stream_operation {
     void print( FILE *fp ) const;
     struct CUstream_st *get_stream() { return m_stream; }
     void set_stream( CUstream_st *stream ) { m_stream = stream; }
+
 private:
     struct CUstream_st *m_stream;
 
@@ -178,9 +179,9 @@ class stream_operation {
     const char *m_symbol;
     size_t m_offset;
 
-    struct CUevent_st *m_event;
     bool m_sim_mode;
     kernel_info_t *m_kernel;
+    struct CUevent_st *m_event;
 };
 
 struct CUevent_st {
@@ -192,7 +193,7 @@ struct CUevent_st {
       m_updates = 0;
       m_wallclock = 0;
       m_gpu_tot_sim_cycle = 0;
-		m_issued = 0;
+      m_issued = 0;
       m_done = false;
    }
    void update( double cycle, time_t clk )
@@ -207,18 +208,14 @@ struct CUevent_st {
    unsigned num_updates() const { return m_updates; }
    bool done() const { return m_done; }
    time_t clock() const { return m_wallclock; }
-	void issue(){
-		m_issued++;
-	}
-	unsigned int num_issued() const{
-		return m_issued;
-	}
+   void issue(){ m_issued++; }
+   unsigned int num_issued() const{ return m_issued; }
 private:
    int m_uid;
    bool m_blocking;
    bool m_done;
    int m_updates;
-	unsigned int m_issued;
+   unsigned int m_issued;
    time_t m_wallclock;
    double m_gpu_tot_sim_cycle;
 
@@ -275,7 +272,7 @@ class stream_manager {
     CUstream_st m_stream_zero;
     bool m_service_stream_zero;
     pthread_mutex_t m_lock;
-	 std::list<struct CUstream_st*>::iterator m_last_stream;
+    std::list<struct CUstream_st*>::iterator m_last_stream;
 };
 
 #endif

From f3ec23390a0798eab1426adf962487680ea89e93 Mon Sep 17 00:00:00 2001
From: boyealkim <gangmul12@snu.ac.kr>
Date: Sat, 27 Jul 2019 03:54:44 +0900
Subject: [PATCH 4/4] fix event done impl

---
 src/stream_manager.cc |  2 +-
 src/stream_manager.h  | 80 +++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/stream_manager.cc b/src/stream_manager.cc
index d0eecddba..0d67e108c 100644
--- a/src/stream_manager.cc
+++ b/src/stream_manager.cc
@@ -198,7 +198,7 @@ bool stream_operation::do_operation( gpgpu_sim *gpu )
         //only allows next op to go if event is done
         //otherwise stays in the stream queue
         printf("stream wait event processing...\n");
-        if(m_event->done()){
+        if(m_event->num_updates()>=m_cnt){
             printf("stream wait event done\n");
             m_stream->record_next_done();
         }
diff --git a/src/stream_manager.h b/src/stream_manager.h
index eb4c6ff06..e73d71a9d 100644
--- a/src/stream_manager.h
+++ b/src/stream_manager.h
@@ -43,6 +43,45 @@
 //    unsigned m_pending_streams;
 //};
 
+struct CUevent_st {
+public:
+   CUevent_st( bool blocking )
+   {
+      m_uid = ++m_next_event_uid;
+      m_blocking = blocking;
+      m_updates = 0;
+      m_wallclock = 0;
+      m_gpu_tot_sim_cycle = 0;
+      m_issued = 0;
+      m_done = false;
+   }
+   void update( double cycle, time_t clk )
+   {
+      m_updates++;
+      m_wallclock=clk;
+      m_gpu_tot_sim_cycle=cycle;
+      m_done = true;
+   }
+   //void set_done() { assert(!m_done); m_done=true; }
+   int get_uid() const { return m_uid; }
+   unsigned num_updates() const { return m_updates; }
+   bool done() const { return m_updates==m_issued; }
+   time_t clock() const { return m_wallclock; }
+   void issue(){ m_issued++; }
+   unsigned int num_issued() const{ return m_issued; }
+private:
+   int m_uid;
+   bool m_blocking;
+   bool m_done;
+   int m_updates;
+   unsigned int m_issued;
+   time_t m_wallclock;
+   double m_gpu_tot_sim_cycle;
+
+   static int m_next_event_uid;
+};
+
+
 enum stream_operation_type {
     stream_no_op,
     stream_memcpy_host_to_device,
@@ -107,6 +146,7 @@ class stream_operation {
         m_kernel=NULL;
         m_type=stream_wait_event;
         m_event=e;
+        m_cnt = m_event->num_issued();
         m_stream=stream;
         m_done=false;
     }
@@ -163,7 +203,6 @@ class stream_operation {
     void print( FILE *fp ) const;
     struct CUstream_st *get_stream() { return m_stream; }
     void set_stream( CUstream_st *stream ) { m_stream = stream; }
-
 private:
     struct CUstream_st *m_stream;
 
@@ -183,45 +222,6 @@ class stream_operation {
     kernel_info_t *m_kernel;
     struct CUevent_st *m_event;
 };
-
-struct CUevent_st {
-public:
-   CUevent_st( bool blocking )
-   {
-      m_uid = ++m_next_event_uid;
-      m_blocking = blocking;
-      m_updates = 0;
-      m_wallclock = 0;
-      m_gpu_tot_sim_cycle = 0;
-      m_issued = 0;
-      m_done = false;
-   }
-   void update( double cycle, time_t clk )
-   {
-      m_updates++;
-      m_wallclock=clk;
-      m_gpu_tot_sim_cycle=cycle;
-      m_done = true;
-   }
-   //void set_done() { assert(!m_done); m_done=true; }
-   int get_uid() const { return m_uid; }
-   unsigned num_updates() const { return m_updates; }
-   bool done() const { return m_done; }
-   time_t clock() const { return m_wallclock; }
-   void issue(){ m_issued++; }
-   unsigned int num_issued() const{ return m_issued; }
-private:
-   int m_uid;
-   bool m_blocking;
-   bool m_done;
-   int m_updates;
-   unsigned int m_issued;
-   time_t m_wallclock;
-   double m_gpu_tot_sim_cycle;
-
-   static int m_next_event_uid;
-};
-
 struct CUstream_st {
 public:
     CUstream_st();