@@ -1970,24 +1970,22 @@ std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& v
 }
 
 bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
-    int64_t process_time_ms         = 0;
-    int64_t read_time_ms            = 0;
-    int64_t memcpy_time_ms          = 0;
-    int64_t copy_to_backend_time_ms = 0;
-    int64_t convert_time_ms         = 0;
-
-    int64_t prev_time_ms = 0;
-    int64_t curr_time_ms = 0;
-    int64_t start_time   = ggml_time_ms();
-    prev_time_ms         = start_time;
+    int64_t process_time_ms = 0;
+    std::atomic<int64_t> read_time_ms(0);
+    std::atomic<int64_t> memcpy_time_ms(0);
+    std::atomic<int64_t> copy_to_backend_time_ms(0);
+    std::atomic<int64_t> convert_time_ms(0);
+
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+
+    int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
 
     {
         std::unordered_map<std::string, TensorStorage> processed_map;
         std::mutex map_mutex;
 
-        int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
-        int n_threads   = std::min(num_threads, (int)tensor_storages.size());
+        int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
         if (n_threads < 1) {
             n_threads = 1;
         }
@@ -2028,14 +2026,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
         }
     }
 
-    curr_time_ms    = ggml_time_ms();
-    process_time_ms = curr_time_ms - prev_time_ms;
-    prev_time_ms    = curr_time_ms;
+    process_time_ms = ggml_time_ms() - start_time;
 
     bool success = true;
     size_t total_tensors_processed = 0;
     const size_t total_tensors_to_process = processed_tensor_storages.size();
     const int64_t t_start = ggml_time_ms();
+    int last_n_threads = 1;
 
     for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
         std::string file_path = file_paths_[file_index];
@@ -2059,11 +2056,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             }
         }
 
-        int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
-        int n_threads   = is_zip ? 1 : std::min(num_threads, (int)file_tensors.size());
+        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
         if (n_threads < 1) {
             n_threads = 1;
         }
+        last_n_threads = n_threads;
 
         std::atomic<size_t> tensor_idx(0);
         std::atomic<bool> failed(false);
@@ -2093,6 +2090,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             std::vector<uint8_t> convert_buffer;
 
             while (true) {
+                int64_t t0, t1;
                 size_t idx = tensor_idx.fetch_add(1);
                 if (idx >= file_tensors.size() || failed) {
                     break;
@@ -2101,13 +2099,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                 const TensorStorage& tensor_storage = *file_tensors[idx];
                 ggml_tensor* dst_tensor = NULL;
 
+                t0 = ggml_time_ms();
+
                 if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) {
                     LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
                     failed = true;
                     break;
                 }
 
                 if (dst_tensor == NULL) {
+                    t1 = ggml_time_ms();
+                    read_time_ms.fetch_add(t1 - t0);
                     continue;
                 }
 
@@ -2118,28 +2120,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                         zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
                         size_t entry_size = zip_entry_size(zip);
                         if (entry_size != n) {
+                            int64_t t_memcpy_start;
                             read_buffer.resize(entry_size);
-                            prev_time_ms = ggml_time_ms();
                             zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
-                            curr_time_ms = ggml_time_ms();
-                            read_time_ms += curr_time_ms - prev_time_ms;
-                            prev_time_ms = curr_time_ms;
+                            t_memcpy_start = ggml_time_ms();
                             memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
-                            curr_time_ms = ggml_time_ms();
-                            memcpy_time_ms += curr_time_ms - prev_time_ms;
+                            memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
                         } else {
-                            prev_time_ms = ggml_time_ms();
                             zip_entry_noallocread(zip, (void*)buf, n);
-                            curr_time_ms = ggml_time_ms();
-                            read_time_ms += curr_time_ms - prev_time_ms;
                         }
                         zip_entry_close(zip);
                     } else {
-                        prev_time_ms = ggml_time_ms();
                         file.seekg(tensor_storage.offset);
                         file.read(buf, n);
-                        curr_time_ms = ggml_time_ms();
-                        read_time_ms += curr_time_ms - prev_time_ms;
                         if (!file) {
                             LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
                             failed = true;
@@ -2156,8 +2149,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     } else {
                         read_data((char*)dst_tensor->data, nbytes_to_read);
                     }
+                    t1 = ggml_time_ms();
+                    read_time_ms.fetch_add(t1 - t0);
 
-                    prev_time_ms = ggml_time_ms();
+                    t0 = ggml_time_ms();
                     if (tensor_storage.is_bf16) {
                         // inplace op
                         bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
@@ -2172,13 +2167,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     } else if (tensor_storage.is_i64) {
                         i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
                     }
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
+                    t1 = ggml_time_ms();
+                    convert_time_ms.fetch_add(t1 - t0);
                 } else {
                     read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                     read_data((char*)read_buffer.data(), nbytes_to_read);
+                    t1 = ggml_time_ms();
+                    read_time_ms.fetch_add(t1 - t0);
 
-                    prev_time_ms = ggml_time_ms();
+                    t0 = ggml_time_ms();
                     if (tensor_storage.is_bf16) {
                         // inplace op
                         bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
@@ -2195,17 +2192,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                         // inplace op
                         i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
                     }
-
-                    convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
-                                   dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
+                    convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                    t1 = ggml_time_ms();
+                    convert_time_ms.fetch_add(t1 - t0);
                 }
             } else {
                 read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                 read_data((char*)read_buffer.data(), nbytes_to_read);
+                t1 = ggml_time_ms();
+                read_time_ms.fetch_add(t1 - t0);
 
-                prev_time_ms = ggml_time_ms();
+                t0 = ggml_time_ms();
                 if (tensor_storage.is_bf16) {
                     // inplace op
                     bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
@@ -2229,20 +2226,18 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     convert_time_ms += curr_time_ms - prev_time_ms;
                     prev_time_ms = curr_time_ms;
                     ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                    curr_time_ms = ggml_time_ms();
-                    copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
+                    t1 = ggml_time_ms();
+                    copy_to_backend_time_ms.fetch_add(t1 - t0);
                 } else {
                     // convert first, then copy to device memory
                     convert_buffer.resize(ggml_nbytes(dst_tensor));
-                    convert_tensor((void*)read_buffer.data(), tensor_storage.type,
-                                   (void*)convert_buffer.data(), dst_tensor->type,
-                                   (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
-                    prev_time_ms = curr_time_ms;
+                    convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                    t1 = ggml_time_ms();
+                    convert_time_ms.fetch_add(t1 - t0);
+                    t0 = ggml_time_ms();
                     ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                    curr_time_ms = ggml_time_ms();
-                    copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
+                    t1 = ggml_time_ms();
+                    copy_to_backend_time_ms.fetch_add(t1 - t0);
                 }
             }
         }
@@ -2281,10 +2276,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
              (end_time - start_time) / 1000.f,
              process_time_ms / 1000.f,
-             read_time_ms / 1000.f,
-             memcpy_time_ms / 1000.f,
-             convert_time_ms / 1000.f,
-             copy_to_backend_time_ms / 1000.f);
+             (read_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (convert_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f);
     return success;
 }
 
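For reference, a minimal self-contained sketch of the timing pattern this patch adopts: each worker thread accumulates its own elapsed milliseconds into shared std::atomic counters with fetch_add, and the final report divides each sum by the thread count to approximate wall-clock time, as the patched LOG_INFO does with last_n_threads. This is an illustration only, not the PR's code: now_ms() stands in for ggml_time_ms(), and the sleep is a hypothetical stand-in for the per-tensor read work.

// Sketch of per-thread timing with atomic accumulators (assumptions noted above).
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

static int64_t now_ms() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    std::atomic<int64_t> read_time_ms(0);
    const int n_threads = 4;

    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads; i++) {
        workers.emplace_back([&]() {
            int64_t t0 = now_ms();
            std::this_thread::sleep_for(std::chrono::milliseconds(50));  // stand-in for file.read()
            read_time_ms.fetch_add(now_ms() - t0);  // each thread adds only its own elapsed time
        });
    }
    for (auto& w : workers) {
        w.join();
    }

    // The summed counter counts overlapping work once per thread, so it can
    // exceed wall-clock time; dividing by the thread count yields the rough
    // per-phase figure the patched log line prints.
    printf("read: %.2fs summed, ~%.2fs averaged over %d threads\n",
           read_time_ms.load() / 1000.f,
           (read_time_ms.load() / (float)n_threads) / 1000.f,
           n_threads);
    return 0;
}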