@@ -292,10 +292,6 @@ static void llama_params_fit_impl(
         if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
             throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
         }
-        if (hp_ngl < 2*nd) {
-            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
-                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
-        }
     }
     if (!tensor_buft_overrides) {
         throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
@@ -362,22 +358,17 @@ static void llama_params_fit_impl(
     auto set_ngl_tensor_split_tbo = [&](
             const std::vector<ngl_t> & ngl_per_device,
             const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams,
-            const bool add_nonrepeating) {
+            llama_model_params & mparams) {
         mparams.n_gpu_layers = 0;
         for (size_t id = 0; id < nd; id++) {
             mparams.n_gpu_layers += ngl_per_device[id].n_layer;
             if (nd > 1) {
                 tensor_split[id] = ngl_per_device[id].n_layer;
             }
         }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
-        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
+        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides

-        if (add_nonrepeating) {
-            mparams.n_gpu_layers += 1;
-            tensor_split[nd - 1] += 1;
-        }
         mparams.tensor_split = tensor_split;

         size_t itbo = 0;
@@ -408,10 +399,9 @@ static void llama_params_fit_impl(
     auto get_memory_for_layers = [&](
             const char * func_name,
             const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            const bool add_nonrepeating) -> std::vector<int64_t> {
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
         llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

         const dmds_t dmd_nl = llama_get_device_memory_data(
             path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -469,9 +459,6 @@ static void llama_params_fit_impl(
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }

-    // whether for the optimal memory use we expect to load at least some MoE tensors:
-    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
-
     std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
     overflow_bufts.reserve(nd);
     for (size_t id = 0; id < nd - 1; ++id) {
@@ -480,7 +467,7 @@ static void llama_params_fit_impl(
     overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

     std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
     if (hp_nex > 0) {
         for (size_t id = 0; id < nd; id++) {
             ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
@@ -493,13 +480,14 @@ static void llama_params_fit_impl(
     // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
     // - check memory use of our guess, replace either the low or high bound
     // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+    // - the last device has the output layer, which cannot be a partial layer
     if (hp_nex == 0) {
         LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
     } else {
         LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
     }
     for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl;
+        uint32_t n_unassigned = hp_ngl + 1;
         for (size_t jd = id + 1; jd < nd; ++jd) {
             assert(n_unassigned >= ngl_per_device[jd].n_layer);
             n_unassigned -= ngl_per_device[jd].n_layer;
@@ -508,10 +496,10 @@ static void llama_params_fit_impl(
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         ngl_per_device_high[id].n_layer = n_unassigned;
         if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
+            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
         }
         if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
             if (mem_high[id] > targets[id]) {
                 assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                 uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
@@ -526,7 +514,7 @@ static void llama_params_fit_impl(
                     if (hp_nex) {
                         ngl_per_device_test[id].n_part += step_size;
                     }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

                     if (mem_test[id] <= targets[id]) {
                         ngl_per_device = ngl_per_device_test;
@@ -553,7 +541,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
     }
     if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
         return;
     }

@@ -576,13 +564,13 @@ static void llama_params_fit_impl(
     for (size_t id = 0; id <= id_dense_start; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
+            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
             ngl_per_device_high[id].n_layer += n_layer_move;
             ngl_per_device_high[jd].n_layer -= n_layer_move;
             ngl_per_device_high[jd].n_part = 0;
         }
         size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
@@ -610,7 +598,7 @@ static void llama_params_fit_impl(
                         break;
                     }
                 }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

                 if (mem_test[id] <= targets[id]) {
                     ngl_per_device = ngl_per_device_test;
@@ -637,7 +625,7 @@ static void llama_params_fit_impl(
         }

         // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > 0) {
+        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
             std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
             size_t id_dense_start_test = id_dense_start;
             ngl_per_device_test[id_dense_start_test].n_layer--;
@@ -649,7 +637,7 @@ static void llama_params_fit_impl(
             }
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -659,7 +647,7 @@ static void llama_params_fit_impl(

             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -670,7 +658,7 @@ static void llama_params_fit_impl(
         } else {
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -687,7 +675,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }

-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }

 bool llama_params_fit(
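
For context, the back-to-front filling loop touched by this patch follows the interpolate-then-bisect strategy outlined in its comment block: measure memory use at a low and a high layer count, interpolate linearly to guess where the target is met, and shrink the bounds until they differ by a single layer. The standalone sketch below only illustrates that idea; fit_layers and measure are hypothetical names, not part of llama.cpp, and the clamping details are assumptions.

// Illustrative sketch of the interpolate-then-bisect layer search (not llama.cpp code).
#include <algorithm>
#include <cstdint>
#include <functional>

// measure(n) returns the projected memory use in bytes when n layers are assigned to the device.
// Assumes measure(0) <= target, i.e. assigning no layers always fits.
static uint32_t fit_layers(uint32_t n_max, int64_t target, const std::function<int64_t(uint32_t)> & measure) {
    uint32_t low  = 0;     // largest layer count known to fit
    uint32_t high = n_max; // optimistic upper bound
    int64_t mem_high = measure(high);
    if (mem_high <= target) {
        return high; // everything fits, no search needed
    }
    int64_t mem_low = measure(low);
    while (high - low > 1) {
        // interpolate memory use per layer linearly to guess where it meets the target:
        int64_t guess = int64_t(low) + int64_t(double(high - low) * double(target - mem_low) / double(mem_high - mem_low));
        guess = std::min<int64_t>(std::max<int64_t>(guess, low + 1), high - 1); // keep the guess strictly between the bounds
        const int64_t mem_guess = measure(uint32_t(guess));
        if (mem_guess <= target) {
            low     = uint32_t(guess); // the guess fits, raise the lower bound
            mem_low = mem_guess;
        } else {
            high     = uint32_t(guess); // the guess does not fit, lower the upper bound
            mem_high = mem_guess;
        }
    }
    return low; // the lower bound that just barely still fits
}

With such a helper, each device would be assigned roughly fit_layers(n_unassigned, targets[id], ...) layers before moving on to the next device, which mirrors what the loop in the patch does with get_memory_for_layers as the measurement step.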