diff --git a/.github/workflows/ompi_mpi4py.yaml b/.github/workflows/ompi_mpi4py.yaml index 29abbcaf70d..bce5ed4f1df 100644 --- a/.github/workflows/ompi_mpi4py.yaml +++ b/.github/workflows/ompi_mpi4py.yaml @@ -20,10 +20,17 @@ permissions: jobs: test: - runs-on: ubuntu-22.04 + # We need Ubuntu 24.04 (over 22.04) due to a kernel bug, + # see https://github.com/google/sanitizers/issues/856. + runs-on: ubuntu-24.04 timeout-minutes: 30 env: MPI4PY_TEST_SPAWN: true + # disable ASAN while building + ASAN_OPTIONS: verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0 + # disable leak detection + LSAN_OPTIONS: detect_leaks=0,exitcode=0 + steps: - name: Configure hostname run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null @@ -31,7 +38,7 @@ jobs: - name: Install depencencies run: sudo apt-get install -y -q - libnuma-dev + libnuma-dev libasan8 if: ${{ runner.os == 'Linux' }} - name: Checkout Open MPI @@ -59,7 +66,8 @@ jobs: --disable-oshmem --disable-silent-rules --prefix=/opt/openmpi - LDFLAGS=-Wl,-rpath,/opt/openmpi/lib + CFLAGS="-O2 -fno-omit-frame-pointer -g -fsanitize=address" + LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address" working-directory: mpi-build - name: Build MPI @@ -115,6 +123,21 @@ jobs: env: CFLAGS: "-O0" + - name: Setting up ASAN environment + # LD_PRELOAD is needed to make sure ASAN is the first thing loaded + # as it will otherwise complain. + # Leak detection is currently disabled because of the size of the report. + # The patcher is disabled because ASAN fails if code mmaps data at fixed + # memory addresses, see https://github.com/open-mpi/ompi/issues/12819. + # ODR violation detection is disabled until #13469 is fixed + # Disabling stack use after return detection to reduce slowdown, per + # https://github.com/llvm/llvm-project/issues/64190. 
+ run: | + echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=0 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV + - name: Test mpi4py (singleton) run: python test/main.py -v -x TestExcErrhandlerNull if: ${{ true }} @@ -145,6 +168,18 @@ jobs: if: ${{ true }} timeout-minutes: 10 + - name: Show MPI (ASAN) + run: ompi_info + + - name: Show MPICC (ASAN) + run: mpicc -show + + - name: Disabling ASAN environment + run: | + echo LD_PRELOAD= >> $GITHUB_ENV + echo ASAN_OPTIONS=verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + - name: Relocate Open MPI installation run: mv /opt/openmpi /opt/ompi - name: Update PATH and set OPAL_PREFIX and LD_LIBRARY_PATH @@ -157,4 +192,3 @@ jobs: run: python test/main.py -v -x TestExcErrhandlerNull if: ${{ true }} timeout-minutes: 10 - diff --git a/.gitignore b/.gitignore index 4e71ce71bce..7ab0b99af7d 100644 --- a/.gitignore +++ b/.gitignore @@ -517,6 +517,7 @@ docs/_static docs/_static/css/custom.css docs/_templates docs/man-openmpi/man3/bindings +docs/*.inv # Common Python virtual environment and cache directory names venv diff --git a/3rd-party/prrte b/3rd-party/prrte index 5ad79eb2850..d4dffd7d9a5 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 5ad79eb285023d1dcca472ccba9de5987b51cc27 +Subproject commit d4dffd7d9a5f36824cebc71fbf5086f73a78fe59 diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index 2a56421e146..79a0f35e35e 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -19,7 +19,7 @@ dnl Copyright (c) 2019-2020 Intel, Inc. All rights reserved. dnl Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. dnl Copyright (c) 2021 Nanook Consulting. 
All rights reserved. dnl Copyright (c) 2021-2022 IBM Corporation. All rights reserved. -dnl Copyright (c) 2023-2024 Jeffrey M. Squyres. All rights reserved. +dnl Copyright (c) 2023-2025 Jeffrey M. Squyres. All rights reserved. dnl Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. dnl $COPYRIGHT$ dnl @@ -39,7 +39,8 @@ dnl results of the build. AC_DEFUN([OMPI_SETUP_PRRTE],[ AC_REQUIRE([AC_PROG_LN_S]) -OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy target_rst_dir]) + OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy target_rst_dir ompi_external_prrte_docs_url]) + ompi_external_prrte_docs_url="https://docs.prrte.org/en/latest/" opal_show_subtitle "Configuring PRRTE" @@ -120,6 +121,8 @@ OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy targe AC_SUBST(OMPI_PRRTE_RST_CONTENT_DIR) AC_SUBST(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR) + AC_SUBST(OMPI_PRRTE_DOCS_URL_BASE) + AC_SUBST(OMPI_USING_INTERNAL_PRRTE) AM_CONDITIONAL(OMPI_HAVE_PRRTE_RST, [test $OMPI_HAVE_PRRTE_RST -eq 1]) AS_IF([test "$OMPI_USING_INTERNAL_PRRTE" = "1"], @@ -250,8 +253,30 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL], [ [OMPI_HAVE_PRRTE_RST=1 OMPI_PRRTE_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/docs/prrte-rst-content" OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/mca/schizo/ompi" + + # If we're building the OMPI Sphinx docs, and also + # building the internal PRRTE, then we're *also* + # building the internal PRRTE docs. + # + # In this case, the OMPI docs/conf.py will do a + # bunch of processing that is a lot easier to do in + # Python than Bourne shell (e.g., use the convenient + # os.path.relpath() to compute the relative path + # that we need, as well as dynamically create a + # Sphinx link inventory file). 
Hence, we skip doing + # all that work here and just set a sentinel value + OMPI_PRRTE_DOCS_URL_BASE="../../prrte/html" AC_MSG_RESULT([found])], - [AC_MSG_RESULT([not found])]) + [ # If we are not building the Sphinx docs, default + # to using the external PRRTE docs URL. This is + # actually moot because we won't be building the + # docs, but we might as well be complete in the + # logic / cases. + OMPI_PRRTE_DOCS_URL_BASE=$ompi_external_prrte_docs_url + AC_MSG_RESULT([not found])]) + + AC_MSG_CHECKING([for internal PRRTE docs link URL base]) + AC_MSG_RESULT([$OMPI_PRRTE_DOCS_URL_BASE]) $1], [$2]) @@ -273,7 +298,7 @@ dnl _OMPI_SETUP_PRRTE_EXTERNAL([action if success], [action if not success]) dnl dnl Try to find an external prrte with sufficient version. AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ - OPAL_VAR_SCOPE_PUSH([ompi_prte_min_version ompi_prte_min_num_version setup_prrte_external_happy opal_prrte_CPPFLAGS_save]) + OPAL_VAR_SCOPE_PUSH([ompi_prte_min_version ompi_prte_min_num_version setup_prrte_external_happy opal_prrte_CPPFLAGS_save ompi_prrte_docdir]) opal_prrte_CPPFLAGS_save=$CPPFLAGS @@ -321,6 +346,10 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ [ # Determine if this external PRRTE has installed the RST # directories that we care about + # In the external case, initially assume we'll use the + # web-based docs + OMPI_PRRTE_DOCS_URL_BASE=$ompi_external_prrte_docs_url + AC_MSG_CHECKING([for external PRRTE RST files]) prrte_install_dir=${with_prrte}/share/prte/rst AS_IF([test -n "$SPHINX_BUILD"], @@ -329,6 +358,17 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ [OMPI_HAVE_PRRTE_RST=1 OMPI_PRRTE_RST_CONTENT_DIR="$prrte_install_dir/prrte-rst-content" OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$prrte_install_dir/schizo-ompi-rst-content" + # If the external PRTE docs dir exists where + # a simple heuristic thinks it should be + # (i.e., the default docdir location), use + # it. 
This will be an absolute path, which + # is fine (because we're building against an + # external PRRTE). If we don't find it, + # we'll fall back to the above-set HTTPS + # internet PRRTE docs URL. + ompi_prrte_docdir="$with_prrte/share/doc/prrte/html" + AS_IF([test -d "$ompi_prrte_docdir"], + [OMPI_PRRTE_DOCS_URL_BASE="$ompi_prrte_docdir"]) AC_MSG_RESULT([found]) ], [ # This version of PRRTE doesn't have installed RST @@ -336,6 +376,9 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ AC_MSG_RESULT([not found]) ]) ]) + + AC_MSG_CHECKING([for external PRRTE docs link URL base]) + AC_MSG_RESULT([$OMPI_PRRTE_DOCS_URL_BASE]) $1], [$2]) diff --git a/config/opal_config_pmix.m4 b/config/opal_config_pmix.m4 index bea801c335e..0bff14c77b3 100644 --- a/config/opal_config_pmix.m4 +++ b/config/opal_config_pmix.m4 @@ -21,6 +21,7 @@ dnl Copyright (c) 2020 Triad National Security, LLC. All rights dnl reserved. dnl Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. dnl Copyright (c) 2021 Nanook Consulting. All rights reserved. +dnl Copyright (c) 2025 Jeffrey M. Squyres. All rights reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -57,7 +58,8 @@ dnl other execution tests later in configure (there are sadly dnl some) would fail if the path in LDFLAGS was not added to dnl LD_LIBRARY_PATH. 
AC_DEFUN([OPAL_CONFIG_PMIX], [ - OPAL_VAR_SCOPE_PUSH([external_pmix_happy internal_pmix_happy internal_pmix_args internal_pmix_wrapper_libs internal_pmix_CPPFLAGS opal_pmix_STATIC_LDFLAGS opal_pmix_LIBS opal_pmix_STATIC_LIBS]) + OPAL_VAR_SCOPE_PUSH([external_pmix_happy internal_pmix_happy internal_pmix_args internal_pmix_wrapper_libs internal_pmix_CPPFLAGS opal_pmix_STATIC_LDFLAGS opal_pmix_LIBS opal_pmix_STATIC_LIBS opal_external_pmix_docs_url]) + opal_external_pmix_docs_url="https://docs.openpmix.org/en/latest/" opal_show_subtitle "Configuring PMIx" @@ -154,6 +156,8 @@ AC_DEFUN([OPAL_CONFIG_PMIX], [ AC_DEFINE_UNQUOTED([OPAL_USING_INTERNAL_PMIX], [$OPAL_USING_INTERNAL_PMIX], [Whether or not we are using the internal PMIx]) + AC_SUBST(OPAL_PMIX_DOCS_URL_BASE) + AC_SUBST(OPAL_USING_INTERNAL_PMIX) OPAL_SUMMARY_ADD([Miscellaneous], [pmix], [], [$opal_pmix_mode]) @@ -216,8 +220,22 @@ AC_DEFUN([_OPAL_CONFIG_PMIX_EXTERNAL], [ dnl it will screw up other tests (like the pthread tests) opal_pmix_BUILD_LIBS="${opal_pmix_LIBS}" + # If the external PMIx docs dir exists where + # a simple heuristic thinks it should be + # (i.e., the default docdir location), use + # it. This will be an absolute path, which + # is fine (because we're building against an + # external PMIx). If we don't find it, + # we'll fall back to the HTTPS internet PMIx + # docs URL. + opal_pmix_docdir="$with_pmix/share/doc/pmix/html" + AS_IF([test -d "$opal_pmix_docdir"], + [OPAL_PMIX_DOCS_URL_BASE="$opal_pmix_docdir"], + [OPAL_PMIX_DOCS_URL_BASE=$opal_external_pmix_docs_url]) + $1], - [$2])]) + [$2]) + ]) OPAL_VAR_SCOPE_POP ]) @@ -238,7 +256,7 @@ AC_DEFUN([_OPAL_CONFIG_PMIX_INTERNAL_POST], [ pmix_internal_happy=1 - dnl Don't pull LDFLAGS, because we don't have a good way to avoid + dnl Do not pull LDFLAGS, because we don't have a good way to avoid dnl a -L to our install directory, which can cause some weirdness dnl if there's an old OMPI install there. And it makes filtering dnl redundant flags easier. 
@@ -279,6 +297,31 @@ AC_DEFUN([_OPAL_CONFIG_PMIX_INTERNAL_POST], [ opal_pmix_BUILD_LIBS="$OMPI_TOP_BUILDDIR/3rd-party/openpmix/src/libpmix.la" + AS_IF([test -n "$SPHINX_BUILD"], + [ # If we're building the OMPI Sphinx docs, and also + # building the internal PMIx, then we're *also* + # building the internal PMIx docs. + # + # In this case, the OMPI docs/conf.py will do a + # bunch of processing that is a lot easier to do in + # Python than Bourne shell (e.g., use the convenient + # os.path.relpath() to compute the relative path + # that we need, as well as dynamically create a + # Sphinx link inventory file). Hence, we skip doing + # all that work here and just set a sentinel value + OPAL_PMIX_DOCS_URL_BASE="../../pmix/html" + AC_MSG_RESULT([found])], + [ # If we are not building the Sphinx docs, default + # to using the external PMIx docs URL. This is + # actually moot because we won't be building the + # docs, but we might as well be complete in the + # logic / cases. + OPAL_PMIX_DOCS_URL_BASE=$opal_external_pmix_docs_url + AC_MSG_RESULT([not found])]) + + AC_MSG_CHECKING([for internal PMIx docs link URL base]) + AC_MSG_RESULT([$OPAL_PMIX_DOCS_URL_BASE]) + OPAL_3RDPARTY_SUBDIRS="$OPAL_3RDPARTY_SUBDIRS openpmix" ]) diff --git a/contrib/update-my-copyright.pl b/contrib/update-my-copyright.pl index 2bfe9f36a49..3ac826bfa32 100755 --- a/contrib/update-my-copyright.pl +++ b/contrib/update-my-copyright.pl @@ -128,31 +128,13 @@ sub quiet_print { # Find the top-level source tree dir in a git repo my $start = cwd(); -my $top = $start; -while (! -d "$top/.git") { - chdir(".."); - $top = cwd(); - die "Can't find top-level repository directory" - if ($top eq "/"); -} -chdir($start); +my $top = `git rev-parse --show-toplevel`; +chomp($top); quiet_print "==> Top-level repository dir: $top\n"; quiet_print "==> Current directory: $start\n"; -# Select VCS used to obtain modification info. Choose in increasing priority -# order (last hit wins). 
-my $vcs; -$vcs = "git" - if (-d "$top/.git"); -$vcs = "hg" - if (-d "$top/.hg"); -$vcs = "svn" - if (-d "$top/.svn"); -$vcs = "manual" - if ("$my_manual_list" ne ""); - -my @files = find_modified_files($vcs); +my @files = find_modified_files(); if ($#files < 0) { quiet_print "No added / changed files -- nothing to do\n"; @@ -284,98 +266,65 @@ sub quiet_print { #------------------------------------------------------------------------------- -# Takes two arguments, the top level directory and the VCS method. Returns a -# list of file names (relative to pwd) which the VCS considers to be modified. +# Returns a list of file names (relative to pwd) which git considers +# to be modified. sub find_modified_files { - my $vcs = shift; my @files = (); - if ($vcs eq "git") { - # Number of path entries to remove from ${top}-relative paths. - # (--show-cdup either returns the empty string or sequence of "../" - # entries, always ending in a "/") - my $n_strip = scalar(split(m!/!, scalar(`git rev-parse --show-cdup`))) - 1; - - # "." restricts scope, but does not get us relative path names - my $cmd = "git status -z --porcelain --untracked-files=no ."; - quiet_print "==> Running: \"$cmd\"\n"; - my $lines = `$cmd`; - - # From git-status(1): - # X Y Meaning - # ------------------------------------------------- - # [MD] not updated - # M [ MD] updated in index - # A [ MD] added to index - # D [ M] deleted from index - # R [ MD] renamed in index - # C [ MD] copied in index - # [MARC] index and work tree matches - # [ MARC] M work tree changed since index - # [ MARC] D deleted in work tree - # ------------------------------------------------- - # D D unmerged, both deleted - # A U unmerged, added by us - # U D unmerged, deleted by them - # U A unmerged, added by them - # D U unmerged, deleted by us - # A A unmerged, both added - # U U unmerged, both modified - # ------------------------------------------------- - # ? ? 
untracked - # ------------------------------------------------- - foreach my $line (split /\x{00}/, $lines) { - my $keep = 0; - my ($s1, $s2, $fullname) = $line =~ m/^(.)(.) (.*)$/; - - # ignore all merge cases - next if ($s1 eq "D" and $s2 eq "D"); - next if ($s1 eq "A" and $s2 eq "A"); - next if ($s1 eq "U" or $s2 eq "U"); - - # only update for actually added/modified cases, no copies, - # renames, etc. - $keep = 1 if ($s1 eq "M" or $s2 eq "M"); - $keep = 1 if ($s1 eq "A"); - - if ($keep) { - my $relname = $fullname; - $relname =~ s!^([^/]*/){$n_strip}!!g; - - push @files, $relname - if (-f $relname); - } - } - } - elsif ($vcs eq "hg" or $vcs eq "svn") { - my $cmd = "$vcs st ."; - - # Run the command, parsing the output. Make a list of files that are - # added or modified. - quiet_print "==> Running: \"$cmd\"\n"; - open(CMD, "$cmd|") || die "Can't run command"; - while () { - chomp; - if ($_ =~ /^M/ || $_ =~ /^A/) { - my @tokens = split(/\s+/, $_); - # Handle output of both forms: - # M filenameA - # A + filenameB - my $filename = $tokens[1]; - $filename = $tokens[2] - if ($tokens[1] =~ /\+/); - # Don't bother saving directory names - push(@files, $filename) - if (-f $filename); - } + # Number of path entries to remove from ${top}-relative paths. + # (--show-cdup either returns the empty string or sequence of "../" + # entries, always ending in a "/") + my $n_strip = scalar(split(m!/!, scalar(`git rev-parse --show-cdup`))) - 1; + + # "." 
restricts scope, but does not get us relative path names + my $cmd = "git status -z --porcelain --untracked-files=no ."; + quiet_print "==> Running: \"$cmd\"\n"; + my $lines = `$cmd`; + + # From git-status(1): + # X Y Meaning + # ------------------------------------------------- + # [MD] not updated + # M [ MD] updated in index + # A [ MD] added to index + # D [ M] deleted from index + # R [ MD] renamed in index + # C [ MD] copied in index + # [MARC] index and work tree matches + # [ MARC] M work tree changed since index + # [ MARC] D deleted in work tree + # ------------------------------------------------- + # D D unmerged, both deleted + # A U unmerged, added by us + # U D unmerged, deleted by them + # U A unmerged, added by them + # D U unmerged, deleted by us + # A A unmerged, both added + # U U unmerged, both modified + # ------------------------------------------------- + # ? ? untracked + # ------------------------------------------------- + foreach my $line (split /\x{00}/, $lines) { + my $keep = 0; + my ($s1, $s2, $fullname) = $line =~ m/^(.)(.) (.*)$/; + + # ignore all merge cases + next if ($s1 eq "D" and $s2 eq "D"); + next if ($s1 eq "A" and $s2 eq "A"); + next if ($s1 eq "U" or $s2 eq "U"); + + # only update for actually added/modified cases, no copies, + # renames, etc. 
+ $keep = 1 if ($s1 eq "M" or $s2 eq "M"); + $keep = 1 if ($s1 eq "A"); + + if ($keep) { + my $relname = $fullname; + $relname =~ s!^([^/]*/){$n_strip}!!g; + + push @files, $relname + if (-f $relname); } - close(CMD); - } - elsif ($vcs eq "manual") { - @files = split(/\n/, `cat $my_manual_list`); - } - else { - die "unknown VCS '$vcs', stopped"; } return @files; diff --git a/docs/Makefile.am b/docs/Makefile.am index ca620636990..871184eb01d 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -1095,8 +1095,22 @@ $(ALL_MAN_BUILT): cp -rpf "$(OMPI_PRRTE_RST_CONTENT_DIR)" "$(builddir)"; \ copied_dir=`basename $(OMPI_PRRTE_RST_CONTENT_DIR)`; \ chmod -R u+w "$(builddir)/$$copied_dir" - $(OMPI_V_SPHINX_HTML) OMPI_TOP_SRCDIR=$(top_srcdir) $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) - $(OMPI_V_SPHINX_MAN) OMPI_TOP_SRCDIR=$(top_srcdir) $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_HTML) \ + OMPI_TOP_SRCDIR="$(top_srcdir)" \ + OMPI_DOCDIR="$(docdir)" \ + OPAL_PMIX_DOCS_URL_BASE="$(OPAL_PMIX_DOCS_URL_BASE)" \ + OPAL_USING_INTERNAL_PMIX="$(OPAL_USING_INTERNAL_PMIX)" \ + OMPI_PRRTE_DOCS_URL_BASE="$(OMPI_PRRTE_DOCS_URL_BASE)" \ + OMPI_USING_INTERNAL_PRRTE="$(OMPI_USING_INTERNAL_PRRTE)" \ + $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_HTML) \ + OMPI_TOP_SRCDIR="$(top_srcdir)" \ + OMPI_DOCDIR="$(docdir)" \ + OPAL_PMIX_DOCS_URL_BASE="$(OPAL_PMIX_DOCS_URL_BASE)" \ + OPAL_USING_INTERNAL_PMIX="$(OPAL_USING_INTERNAL_PMIX)" \ + OMPI_PRRTE_DOCS_URL_BASE="$(OMPI_PRRTE_DOCS_URL_BASE)" \ + OMPI_USING_INTERNAL_PRRTE="$(OMPI_USING_INTERNAL_PRRTE)" \ + $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) # A useful rule to invoke manually to ensure that all of the external # HTML links we have are valid. 
Running this rule requires @@ -1117,6 +1131,7 @@ linkcheck: clean-local: rm -rf $(OUTDIR) rm -rf prrte-rst-content schizo-ompi-rst-content + rm -rf ompi-prrte-objects.inv opal-pmix-objects.inv if test "$(srcdir)" != "$(builddir)"; then \ len=`echo "$(srcdir)/" | wc -c`; \ for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ diff --git a/docs/conf.py b/docs/conf.py index e443d693fd5..b6e85216a5f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,6 +17,7 @@ import os import re import datetime +import sphobjinv as soi year = datetime.datetime.now().year @@ -28,9 +29,7 @@ # The docs/Makefile.am will set the env var OMPI_TOP_SRCDIR, because # we might be doing a VPATH build. -ompi_top_srcdir = '..' -if 'OMPI_TOP_SRCDIR' in os.environ: - ompi_top_srcdir = os.environ['OMPI_TOP_SRCDIR'] +ompi_top_srcdir = os.environ.get('OMPI_TOP_SRCDIR', '..') # Read an Open MPI-style VERSION file def read_version_file(path): @@ -126,8 +125,7 @@ def get_tarball_version(path, expr): # If we're building in an RTD environment for a tag or external (i.e., # PR), use the RTD version -- not what we just read from the VERSIONS # file. -key = 'READTHEDOCS' -if key in os.environ and os.environ[key] == 'True': +if os.environ.get('READTHEDOCS') == 'True': print("OMPI: found ReadTheDocs build environment") # Tell Jinja2 templates the build is running on Read the Docs @@ -178,8 +176,110 @@ def get_tarball_version(path, expr): 'recommonmark', "sphinx_rtd_theme", "sphinx.ext.extlinks", + "sphinx.ext.intersphinx", ] +########################################################################## + +# Map to external documentation: PMIx and PRRTE + +def _make_intersphinx_mapping(project, name, fallback_base, entries): + # If there is no PROJECT_NAME_DOCS_URL_BASE (e.g., in a ReadTheDocs + # build), then use the fallback_base. 
+ key = f'{project}_{name}_DOCS_URL_BASE'.upper() + docs_url_base = os.environ.get(key, fallback_base).strip() + key = f'{project}_USING_INTERNAL_{name}'.upper() + using_internal = os.environ.get(key, '0').strip() + + if using_internal == '0': + # In this case, we're using some external URL base -- either on the + # filesystem or via https. Just use that directly. + inv_filename = None + else: + # In this case, we're using the internal (embedded) version of + # the project (e.g., PMIX or PRRTE). Two things: + # + # 1. The internal PMIX / PRRTE docs have not yet been installed. + # Hence, we have to build our own objects.inv file to use during + # this Sphinx build. + # 2. We have to use relative links because these links must work + # - in the installed tree + # - in a pre-built tarball of the docs (where we don't know + # the install prefix before building) + # NOTE: We specifically realize that these relative links won't + # work in the build tree (because the install tree and build + # tree will likely have different relative paths). We have + # decided that this is acceptable -- the installed tree is + # the more important case to get right. + + # Use the official Sphinx Object Inventory library to build + # an objects.inv file on the fly. We know we only need a few specific + # labels from the internal PMIX / PRRTE docs, so we will just + # build those specific entries. + # + # 1. Initialize an empty inventory. Since this is a fake inventory, + # the name/version values don't matter. + inv = soi.Inventory() + inv.project = f'Open MPI {project}-{name} documentation' + inv.version = '1.2.3' + + # 2. Make the single link that we need + # - name: the unique ID for the link + # - domain: either 'py' or 'std'; we want "std" for a label + # - role: 'class', 'func', 'doc', 'label', etc. 
+ # - uri: the relative path to the HTML page + # - dispname: what shows up in the link text ('-' means same as name) + for label, uri in entries.items(): + inv.objects.append(soi.DataObjStr( + name=label, + domain='std', + role='label', + priority='-1', + uri=uri, + dispname='-' + )) + + # 3. Export to a compressed objects.inv file + text_data = inv.data_file(contract=True) + zlib_data = soi.compress(text_data) + inv_filename = f'{project}-{name}-objects.inv' + soi.writebytes(inv_filename, zlib_data) + + # 4. Finally, figure out: + # - the docdir/html for where OMPI html docs will be installed + # - the docdir/html for where this project_name HTML docs will be installed + # Then compute the relative path between them. This will be + # the URL base that we will use for intersphinx mapping. + + # OMPI HTML docdir + docdir = os.environ.get('OMPI_DOCDIR', '.') + docdir_html = os.path.join(docdir, 'html') + + # This project_name's HTML docdir + docdir_parent = os.path.dirname(docdir) + docdir_name_html = os.path.join(docdir_parent, name, 'html') + + # Compute the relative path between them + docs_url_base = os.path.relpath(docdir_name_html, start=docdir_html) + + global intersphinx_mapping + intersphinx_mapping[name] = (docs_url_base, inv_filename) + +intersphinx_mapping = {} +_make_intersphinx_mapping("opal", "pmix", "https://docs.openpmix.org/en/latest/", { + 'man1-pmix_info' : 'man/man1/pmix_info.1.html#pmix-info', + }) +_make_intersphinx_mapping("ompi", "prrte", "https://docs.prrte.org/en/latest/", { + 'man1-prte_info' : 'man/man1/ompi-prte_info.1.html#man1-prte-info', + }) + +# Sphinx defaults to automatically resolve *unresolved* labels using all your Intersphinx mappings. +# This behavior has unintended side-effects, namely that documentations local references can +# suddenly resolve to an external location. 
+# See also: +# https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#confval-intersphinx_disabled_reftypes +intersphinx_disabled_reftypes = ["*"] + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/launching-apps/gridengine.rst b/docs/launching-apps/gridengine.rst index 462cb26b61e..0fed2ea191b 100644 --- a/docs/launching-apps/gridengine.rst +++ b/docs/launching-apps/gridengine.rst @@ -17,7 +17,7 @@ Verify Grid Engine support command line switch to Open MPI's ``configure`` script. To verify if support for Grid Engine is configured into your Open MPI -installation, run ``prte_info`` as shown below and look for +installation, run :ref:`prte_info(1) <prrte:man1-prte_info>` as shown below and look for ``gridengine``. .. code-block:: @@ -30,8 +30,8 @@ installation, run ``prte_info`` as shown below and look for PMIx and PRRTE details from the end user, but this is one place that Open MPI is unable to hide the fact that PRRTE provides this functionality, not Open MPI. Hence, users need to use the - ``prte_info`` command to check for Grid Engine support (not - ``ompi_info``). + :ref:`prte_info(1) <prrte:man1-prte_info>` command to check for Grid Engine support (not + :ref:`ompi_info(1) <man1-ompi_info>`). Launching --------- @@ -40,7 +40,7 @@ When Grid Engine support is included, Open MPI will automatically detect when it is running inside SGE and will just "do the Right Thing." -Specifically, if you execute an ``mpirun`` command in a Grid Engine +Specifically, if you execute an :ref:`mpirun(1) <man1-mpirun>` command in a Grid Engine job, it will automatically use the Grid Engine mechanisms to launch and kill processes. There is no need to specify what nodes to run on |mdash| Open MPI will obtain this information directly from Grid @@ -231,13 +231,13 @@ Grid Engine job suspend / resume support ---------------------------------------- To suspend the job, you send a SIGTSTP (not SIGSTOP) signal to -``mpirun``. 
``mpirun`` will catch this signal and forward it to the +:ref:`mpirun(1) <man1-mpirun>`. :ref:`mpirun(1) <man1-mpirun>` will catch this signal and forward it to the ``mpi-hello-world`` as a SIGSTOP signal. To resume the job, you send -a SIGCONT signal to ``mpirun`` which will be caught and forwarded to +a SIGCONT signal to :ref:`mpirun(1) <man1-mpirun>` which will be caught and forwarded to the ``mpi-hello-world``. By default, this feature is not enabled. This means that both the -SIGTSTP and SIGCONT signals will simply be consumed by the ``mpirun`` +SIGTSTP and SIGCONT signals will simply be consumed by the :ref:`mpirun(1) <man1-mpirun>` process. To have them forwarded, you have to run the job with ``--mca orte_forward_job_control 1``. Here is an example on Solaris: diff --git a/docs/launching-apps/lsf.rst b/docs/launching-apps/lsf.rst index 7660ca2e2aa..159a85a84a4 100644 --- a/docs/launching-apps/lsf.rst +++ b/docs/launching-apps/lsf.rst @@ -6,7 +6,8 @@ Open MPI supports the LSF resource manager. Verify LSF support ------------------ -The ``prte_info`` command can be used to determine whether or not an +The :ref:`prte_info(1) <prrte:man1-prte_info>` +command can be used to determine whether or not an installed Open MPI includes LSF support: .. code-block:: @@ -27,8 +28,9 @@ installed. PMIx and PRRTE details from the end user, but this is one place that Open MPI is unable to hide the fact that PRRTE provides this functionality, not Open MPI. Hence, users need to use the - ``prte_info`` command to check for LSF support (not - ``ompi_info``). + :ref:`prte_info(1) <prrte:man1-prte_info>` + command to check for LSF support (not + :ref:`ompi_info(1) <man1-ompi_info>`). Launching --------- diff --git a/docs/launching-apps/pals.rst b/docs/launching-apps/pals.rst index 49e818acb91..aa44bfffce0 100644 --- a/docs/launching-apps/pals.rst +++ b/docs/launching-apps/pals.rst @@ -27,7 +27,8 @@ documentation :doc:`tm`. 
Verify PALS support ------------------- -The ``prte_info`` command can be used to determine whether or not an +The :ref:`prte_info(1) <prrte:man1-prte_info>` +command can be used to determine whether or not an installed Open MPI includes PALS support: .. code-block:: @@ -49,11 +50,12 @@ Using ``mpirun`` This section assumes there is PALS support in the PRRTE being used for the Open MPI installation. -When ``mpirun`` is launched in a PBS job, ``mpirun`` will +When :ref:`mpirun(1) <man1-mpirun>` is launched in a PBS job, +:ref:`mpirun(1) <man1-mpirun>` will automatically utilize the PALS infrastructure for launching and controlling the individual MPI processes. -.. note:: Using ``mpirun`` is the recommended method for launching Open +.. note:: Using :ref:`mpirun(1) <man1-mpirun>` is the recommended method for launching Open MPI jobs on HPE systems where PALS is available. This is primarily due to limitations in the PMIx server provided in PALS. @@ -75,7 +77,7 @@ Using PALS "direct launch" functionality ---------------------------------------- The HPE PALS 1.5.0 documentation states that it comes pre-built with PMIx support. -By default the PALS ``aprun`` launcher does not use PMIx. To use the launcher's +By default the PALS ``aprun(1)`` launcher does not use PMIx. To use the launcher's PMIx capabilities either the command line option ``--pmix=pmix`` needs to be set or the ``ALPS_PMI`` environment variable needs to be set to ``pmix``. @@ -89,4 +91,4 @@ or the ``ALPS_PMI`` environment variable needs to be set to ``pmix``. In these examples, four instances of the application are started, two instances per node. -See the PALS ``aprun`` man page for documentation on how to this command. +See the PALS ``aprun(1)`` man page for documentation on how to use this command. diff --git a/docs/launching-apps/tm.rst b/docs/launching-apps/tm.rst index a19727ce983..8d3c2ebc134 100644 --- a/docs/launching-apps/tm.rst +++ b/docs/launching-apps/tm.rst @@ -7,7 +7,7 @@ managers. 
Verify PBS/Torque support ------------------------- -The ``prte_info`` command can be used to determine whether or not an +The :ref:`prte_info(1) <prrte:man1-prte_info>` command can be used to determine whether or not an installed Open MPI includes Torque/PBS Pro support: .. code-block:: @@ -28,8 +28,8 @@ installed. PMIx and PRRTE details from the end user, but this is one place that Open MPI is unable to hide the fact that PRRTE provides this functionality, not Open MPI. Hence, users need to use the - ``prte_info`` command to check for PBS/Torque support (not - ``ompi_info``). + :ref:`prte_info(1) <prrte:man1-prte_info>` command to check for PBS/Torque support (not + :ref:`ompi_info(1) <man1-ompi_info>`). Launching --------- @@ -37,7 +37,7 @@ Launching When properly configured, Open MPI obtains both the list of hosts and how many processes to start on each host from Torque / PBS Pro directly. Hence, it is unnecessary to specify the ``--hostfile``, -``--host``, or ``-n`` options to ``mpirun``. Open MPI will use +``--host``, or ``-n`` options to :ref:`mpirun(1) <man1-mpirun>`. Open MPI will use PBS/Torque-native mechanisms to launch and kill processes (``ssh`` is not required). diff --git a/docs/mca.rst b/docs/mca.rst index fbaf7af2950..0c8256a4e55 100644 --- a/docs/mca.rst +++ b/docs/mca.rst @@ -530,8 +530,8 @@ Open MPI has a *large* number of MCA parameters available. Users can use the :ref:`ompi_info(1) <man1-ompi_info>` command to see *all* available MCA parameters. -.. note:: Similarly, you can use the ``pmix_info(1)`` and - ``prte_info(1)`` commands to see all the MCA parameters +.. note:: Similarly, you can use the :ref:`pmix_info(1) <pmix:man1-pmix_info>` and + :ref:`prte_info(1) <prrte:man1-prte_info>` commands to see all the MCA parameters available for the PMIx and PRRTE projects, respectively. 
The documentation for these commands are not included in the diff --git a/docs/requirements.txt b/docs/requirements.txt index 001c5f10adf..9fda5a43a4d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,6 +2,7 @@ sphinx>=4.2.0 recommonmark docutils sphinx-rtd-theme +sphobjinv # These modules are needed for the pympistandard module when you are # running Python 3.6 (they became part of core Python in 3.7). We diff --git a/docs/tuning-apps/networking/tcp.rst b/docs/tuning-apps/networking/tcp.rst index 3858a24e9eb..7b87546596c 100644 --- a/docs/tuning-apps/networking/tcp.rst +++ b/docs/tuning-apps/networking/tcp.rst @@ -256,20 +256,17 @@ not use specific IP networks |mdash| or not use any IP networks at all .. warning:: If you use the ``btl_tcp_if_include`` and ``btl_tcp_if_exclude`` MCA parameters to shape the behavior of the TCP BTL for MPI communications, you may - also need/want to investigate the corresponding MCA - parameters ``oob_tcp_if_include`` and - ``oob_tcp_if_exclude``, which are used to shape non-MPI - TCP-based communication (e.g., communications setup and - coordination during ``MPI_INIT`` and ``MPI_FINALIZE``). - -.. error:: TODO do corresponding OOB TCP params still exist in PMIx? - -Note that Open MPI will still use TCP for control messages, such as -data between ``mpirun`` and the MPI processes, rendezvous information -during ``MPI_INIT``, etc. To disable TCP altogether, you also need to -disable the ``tcp`` component from the OOB framework. - -.. error:: TODO Is this possible in PMIx? I doubt it...? + also need/want to investigate the corresponding PRRTE + parameters that control use of network interfaces by the + runtime (e.g., communications setup and coordination + during :ref:`MPI_Init` and :ref:`MPI_Finalize`) using the + :ref:`prte_info(1) ` + and :ref:`pmix_info(1) ` commands. 
+ +Note that the Open MPI runtime uses TCP for control messages, such as +for data exchange between ``mpirun(1)`` and the MPI processes, +rendezvous information during :ref:`MPI_Init`, etc. even if the +``tcp`` BTL component is disabled. ///////////////////////////////////////////////////////////////////////// diff --git a/ompi/communicator/ft/comm_ft_revoke.c b/ompi/communicator/ft/comm_ft_revoke.c index 81e0c7ceb98..adfadc5f9e4 100644 --- a/ompi/communicator/ft/comm_ft_revoke.c +++ b/ompi/communicator/ft/comm_ft_revoke.c @@ -55,7 +55,7 @@ int ompi_comm_revoke_internal(ompi_communicator_t* comm) OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), __func__, ompi_comm_print_cid(comm), comm->c_epoch )); /* Mark locally revoked */ - if( ompi_comm_revoke_local(comm, NULL) ) { + if( ompi_comm_revoke_local(comm, false) ) { /* Broadcast the 'revoke' signal to all other processes. */ ompi_comm_rbcast_message_t msg; msg.cid = ompi_comm_get_local_cid(comm); @@ -73,15 +73,15 @@ bool ompi_comm_revoke_local(ompi_communicator_t* comm, bool coll_only) { if( comm->comm_revoked || (coll_only && comm->coll_revoked) ) { OPAL_OUTPUT_VERBOSE((9, ompi_ftmpi_output_handle, - "%s %s: comm %s:%d is already %srevoked, nothing to do", + "%s %s: comm %s:%d is already %s revoked, nothing to do", OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), __func__, ompi_comm_print_cid(comm), comm->c_epoch, - coll_only ? "coll " : "")); + coll_only ? "coll" : "fully")); return false; } OPAL_OUTPUT_VERBOSE((9, ompi_ftmpi_output_handle, - "%s %s: comm %s:%d is marked %srevoked locally", + "%s %s: comm %s:%d is marked %s revoked locally", OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), __func__, ompi_comm_print_cid(comm), comm->c_epoch, - coll_only ? "coll " : "")); + coll_only ? 
"coll" : "fully")); /* * Locally revoke the communicator * diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index 8ca19a9724c..bd686d2bab2 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -835,6 +835,9 @@ int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t opal_set_using_threads(true); } + /* Set single-threaded flag for optimization purposes */ + opal_single_threaded = (ts_level == MPI_THREAD_SINGLE); + opal_mutex_lock (&instance_lock); if (0 == opal_atomic_fetch_add_32 (&ompi_instance_count, 1)) { ret = ompi_mpi_instance_init_common (argc, argv); diff --git a/ompi/mca/coll/acoll/coll_acoll_bcast.c b/ompi/mca/coll/acoll/coll_acoll_bcast.c index b0116fb2361..b1eaf03a17a 100644 --- a/ompi/mca/coll/acoll/coll_acoll_bcast.c +++ b/ompi/mca/coll/acoll/coll_acoll_bcast.c @@ -693,7 +693,7 @@ int mca_coll_acoll_bcast(void *buff, size_t count, struct ompi_datatype_t *datat return ompi_coll_base_bcast_intra_knomial(buff, count, datatype, root, comm, module, 0, 4); } } - if ((!subc->initialized || (root != subc->prev_init_root)) && size > 2) { + if (!subc->initialized || (root != subc->prev_init_root)) { err = mca_coll_acoll_comm_split_init(comm, acoll_module, subc, root); if (MPI_SUCCESS != err) { return err; @@ -704,13 +704,8 @@ int mca_coll_acoll_bcast(void *buff, size_t count, struct ompi_datatype_t *datat total_dsize = dsize * count; rank = ompi_comm_rank(comm); sg_cnt = acoll_module->sg_cnt; - if (size > 2) { - num_nodes = subc->num_nodes; - node_size = ompi_comm_size(subc->local_comm); - } else { - num_nodes = 1; - node_size = size; - } + num_nodes = subc->num_nodes; + node_size = ompi_comm_size(subc->local_comm); /* Use knomial for nodes 8 and above and non-large messages */ if (((num_nodes >= 8 && total_dsize <= 65536) @@ -727,9 +722,6 @@ int mca_coll_acoll_bcast(void *buff, size_t count, struct ompi_datatype_t *datat &use_numa, &use_socket, &use_shm, &lin_0, &lin_1, &lin_2, num_nodes, acoll_module, 
subc); no_sg = (sg_cnt == node_size) ? 1 : 0; - if (size <= 2) { - no_sg = 1; - } /* Disable shm based bcast if: */ /* - datatype is not a predefined type */ diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c index ec0c07b6e7e..69da3cb49cf 100644 --- a/ompi/mca/coll/acoll/coll_acoll_reduce.c +++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c @@ -63,7 +63,7 @@ static inline int coll_acoll_reduce_topo(const void *sbuf, void *rbuf, size_t co rank = ompi_comm_rank(comm); - int use_socket = 1; + int use_socket = (0 == acoll_module->use_socket) ? 1 : acoll_module->use_socket; tmp_sbuf = (char *) sbuf; if ((MPI_IN_PLACE == sbuf) && (rank == root)) { diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index a5208f36044..9fcd65dad9b 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -418,6 +418,11 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, int mca_coll_han_revoke_local(ompi_communicator_t *comm, mca_coll_base_module_t *module) { + // Note that this "coll" revokes the subcomms regardless of whether the + // parent comm is "coll" revoked or "fully" revoked, so it is important + // to only use collective tags on communication in these subcomms. Else, + // one should check the impact to the overall revocation process before + // changing these to "fully" revoking the subcomms. 
mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; for(int i = 0; i < NB_TOPO_LVL; i++){ if(NULL == han_module->sub_comm[i]) continue; @@ -430,9 +435,9 @@ int mca_coll_han_revoke_local(ompi_communicator_t *comm, } } if(han_module->cached_up_comms != NULL){ - for(int i = 0; i < COLL_HAN_LOW_MODULES; i++){ - if(NULL == han_module->cached_low_comms[i]) continue; - ompi_comm_revoke_local(han_module->cached_low_comms[i], true); + for(int i = 0; i < COLL_HAN_UP_MODULES; i++){ + if(NULL == han_module->cached_up_comms[i]) continue; + ompi_comm_revoke_local(han_module->cached_up_comms[i], true); } } return MPI_SUCCESS; diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index bb5ed3f762a..47634caec25 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -137,6 +137,7 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_ /* Bcast */ int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS); +int ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize); int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index fa31aef1860..e97993ffe10 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -654,6 +654,79 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, size_t count, alg, 0, 0); } + +/* + * bcast_intra_dec for inter node communicators + * + * Function: - selects broadcast algorithm to use + * Accepts: - same arguments as MPI_Bcast() + * Returns: - MPI_SUCCESS or error code (passed from the bcast implementation) + */ +int ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(void 
*buff, size_t count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t total_dsize, dsize; + int communicator_size, alg; + communicator_size = ompi_comm_size(comm); + + ompi_datatype_type_size(datatype, &dsize); + total_dsize = dsize * (unsigned long)count; + + OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_disjoint_dec_fixed" + " root %d rank %d com_size %d", + root, ompi_comm_rank(comm), communicator_size)); + + /** Algorithms: + * {1, "basic_linear"}, + * {2, "chain"}, + * {3, "pipeline"}, + * {4, "split_binary_tree"}, + * {5, "binary_tree"}, + * {6, "binomial"}, + * {7, "knomial"}, + * {8, "scatter_allgather"}, + * {9, "scatter_allgather_ring"}, + */ + if (communicator_size < 4) { + alg = 1; + } else if (communicator_size < 8) { + if (total_dsize < 1048576) { + alg = 1; + } else { + alg = 5; + } + } else if (communicator_size < 16) { + if (total_dsize < 1048576) { + alg = 1; + } else { + alg = 5; + } + } else if (communicator_size < 32) { + if (total_dsize < 262144) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 7; + } else { + alg = 5; + } + } else { + if (total_dsize < 65536) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 7; + } else { + alg = 5; + } + } + + return ompi_coll_tuned_bcast_intra_do_this (buff, count, datatype, root, + comm, module, + alg, 0, 0); +} + + /* * reduce_intra_dec * diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index eb4fb125380..20bb4c4a49b 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -100,14 +100,20 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority) /* By default stick with the fixed version of the tuned collectives. Later on, * when the module get enabled, set the correct version based on the availability * of the dynamic rules. 
+ * For some collectives, we distinguish between disjoint communicators to make + * decision specific for inter node communication. */ + if (OMPI_COMM_IS_DISJOINT_SET(comm) && OMPI_COMM_IS_DISJOINT(comm)) { + tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_disjoint_dec_fixed; + } else { + tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_fixed; + } tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_fixed; tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_fixed; tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_fixed; tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_fixed; tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_fixed; tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_fixed; - tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_fixed; tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_fixed; tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_fixed; tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_fixed; diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 93b0fb9c9a0..635a53a3e0f 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -190,6 +190,7 @@ static int component_register(void) { free(description_str); opal_common_ucx_thread_enabled = opal_using_threads(); + opal_common_ucx_single_threaded = opal_single_threaded; mca_osc_ucx_component.acc_single_intrinsic = false; opal_asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes " diff --git a/ompi/mca/pml/ucx/pml_ucx_request.c b/ompi/mca/pml/ucx/pml_ucx_request.c index 2c24d8d178c..532c89b51d6 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.c +++ b/ompi/mca/pml/ucx/pml_ucx_request.c @@ -236,6 +236,7 @@ static int 
mca_pml_ucx_persistent_request_free(ompi_request_t **rptr) ucp_request_free(tmp_req); } OMPI_DATATYPE_RELEASE(preq->ompi_datatype); + OMPI_REQUEST_FINI(&preq->ompi); PML_UCX_FREELIST_RETURN(&ompi_pml_ucx.persistent_reqs, &preq->ompi.super); *rptr = MPI_REQUEST_NULL; return OMPI_SUCCESS; diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h index e12d490b390..20345d02c6b 100644 --- a/opal/mca/btl/ofi/btl_ofi.h +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -139,6 +139,8 @@ struct mca_btl_ofi_module_t { /** registration cache */ mca_rcache_base_module_t *rcache; + + mca_btl_base_module_error_cb_fn_t ofi_error_cb; }; typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t; diff --git a/opal/mca/btl/ofi/btl_ofi_context.c b/opal/mca/btl/ofi/btl_ofi_context.c index 2b9a5fb6905..ea876d548c0 100644 --- a/opal/mca/btl/ofi/btl_ofi_context.c +++ b/opal/mca/btl/ofi/btl_ofi_context.c @@ -310,6 +310,56 @@ mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl) return &btl->contexts[rr_num++ % btl->num_contexts]; } +static void inline complete_op_context(mca_btl_ofi_context_t* context, + void *op_context, int rc) +{ + mca_btl_ofi_completion_context_t *c_ctx = + (mca_btl_ofi_completion_context_t*) op_context; + /* We are casting to every type here just for simplicity. 
*/ + mca_btl_ofi_base_completion_t *comp = + (mca_btl_ofi_base_completion_t *) c_ctx->comp; + mca_btl_ofi_frag_completion_t *frag_comp = + (mca_btl_ofi_frag_completion_t *) c_ctx->comp; + mca_btl_ofi_rdma_completion_t *rdma_comp + = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp; + + switch (comp->type) { + case MCA_BTL_OFI_TYPE_GET: + case MCA_BTL_OFI_TYPE_PUT: + case MCA_BTL_OFI_TYPE_AOP: + case MCA_BTL_OFI_TYPE_AFOP: + case MCA_BTL_OFI_TYPE_CSWAP: + /* call the callback */ + if (rdma_comp->cbfunc) { + rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address, + rdma_comp->local_handle, rdma_comp->cbcontext, + rdma_comp->cbdata, rc); + } + + MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t *) comp->btl); + break; + + case MCA_BTL_OFI_TYPE_RECV: + mca_btl_ofi_recv_frag((mca_btl_ofi_module_t *) comp->btl, + (mca_btl_ofi_endpoint_t *) comp->endpoint, context, + frag_comp->frag, rc); + break; + + case MCA_BTL_OFI_TYPE_SEND: + MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t *) comp->btl); + mca_btl_ofi_frag_complete(frag_comp->frag, rc); + break; + + default: + /* catastrophic */ + BTL_ERROR(("unknown completion type")); + MCA_BTL_OFI_ABORT(); + } + + /* return the completion handler */ + opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp); +} + int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) { @@ -319,11 +369,6 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE]; struct fi_cq_err_entry cqerr = {0}; - mca_btl_ofi_completion_context_t *c_ctx; - mca_btl_ofi_base_completion_t *comp; - mca_btl_ofi_rdma_completion_t *rdma_comp; - mca_btl_ofi_frag_completion_t *frag_comp; - ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); if (0 < ret) { @@ -331,49 +376,7 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) for (int i = 0; i < events_read; i++) { if (NULL != cq_entry[i].op_context) { ++events; - - c_ctx = 
(mca_btl_ofi_completion_context_t *) cq_entry[i].op_context; - - /* We are casting to every type here just for simplicity. */ - comp = (mca_btl_ofi_base_completion_t *) c_ctx->comp; - frag_comp = (mca_btl_ofi_frag_completion_t *) c_ctx->comp; - rdma_comp = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp; - - switch (comp->type) { - case MCA_BTL_OFI_TYPE_GET: - case MCA_BTL_OFI_TYPE_PUT: - case MCA_BTL_OFI_TYPE_AOP: - case MCA_BTL_OFI_TYPE_AFOP: - case MCA_BTL_OFI_TYPE_CSWAP: - /* call the callback */ - if (rdma_comp->cbfunc) { - rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address, - rdma_comp->local_handle, rdma_comp->cbcontext, - rdma_comp->cbdata, OPAL_SUCCESS); - } - - MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t *) comp->btl); - break; - - case MCA_BTL_OFI_TYPE_RECV: - mca_btl_ofi_recv_frag((mca_btl_ofi_module_t *) comp->btl, - (mca_btl_ofi_endpoint_t *) comp->endpoint, context, - frag_comp->frag); - break; - - case MCA_BTL_OFI_TYPE_SEND: - MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t *) comp->btl); - mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS); - break; - - default: - /* catasthrophic */ - BTL_ERROR(("unknown completion type")); - MCA_BTL_OFI_ABORT(); - } - - /* return the completion handler */ - opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp); + complete_op_context(context, cq_entry[i].op_context, OPAL_SUCCESS); } } } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { @@ -383,10 +386,35 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) if (0 > ret) { BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", __FILE__, __LINE__, fi_strerror(-ret), ret)); - } else { - BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", cqerr.prov_errno)); + MCA_BTL_OFI_ABORT(); + } else if(NULL != cqerr.op_context){ + switch(cqerr.err) { + case -FI_EIO: { + mca_btl_ofi_completion_context_t *c_ctx = + (mca_btl_ofi_completion_context_t*) cqerr.op_context; + mca_btl_ofi_base_completion_t *comp = + 
(mca_btl_ofi_base_completion_t*) c_ctx->comp; + mca_btl_ofi_module_t *ofi_btl = + (mca_btl_ofi_module_t*) comp->btl; + if(ofi_btl->ofi_error_cb){ + opal_proc_t *ep_proc = NULL; + if(comp->endpoint){ + ep_proc = comp->endpoint->ep_proc; + } + ofi_btl->ofi_error_cb(comp->btl, 0, ep_proc, + "IO error reported by libfabric"); + } + + ++events; + complete_op_context(context, cqerr.op_context, OPAL_ERR_UNREACH); + break; + } + default: + BTL_ERROR(("fi_cq_readerr: %s(%d) (provider err_code = %d)\n", + fi_strerror(-cqerr.err), cqerr.err, cqerr.prov_errno)); + MCA_BTL_OFI_ABORT(); + } } - MCA_BTL_OFI_ABORT(); } #ifdef FI_EINTR /* sometimes, sockets provider complain about interrupt. We do nothing. */ diff --git a/opal/mca/btl/ofi/btl_ofi_frag.c b/opal/mca/btl/ofi/btl_ofi_frag.c index 25433c0b6a3..e325dd34ccf 100644 --- a/opal/mca/btl/ofi/btl_ofi_frag.c +++ b/opal/mca/btl/ofi/btl_ofi_frag.c @@ -145,9 +145,9 @@ int mca_btl_ofi_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi } int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag) + mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag, + int rc) { - int rc; mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + frag->hdr.tag; mca_btl_base_segment_t segment = {.seg_addr.pval = (void *) (frag + 1), .seg_len = frag->hdr.len}; @@ -160,7 +160,7 @@ int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t /* call the callback */ reg->cbfunc(&ofi_btl->super, &recv_desc); - mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS); + mca_btl_ofi_frag_complete(frag, rc); /* repost the recv */ rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t *) ofi_btl, context, 1); diff --git a/opal/mca/btl/ofi/btl_ofi_frag.h b/opal/mca/btl/ofi/btl_ofi_frag.h index 3afa8866265..786fafa3bbe 100644 --- a/opal/mca/btl/ofi/btl_ofi_frag.h +++ b/opal/mca/btl/ofi/btl_ofi_frag.h @@ -38,7 +38,8 @@ 
int mca_btl_ofi_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag); int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag); + mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag, + int rc); struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 696a4614ac9..b985e06aab0 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -143,6 +143,14 @@ static int mca_btl_ofi_del_procs(mca_btl_base_module_t *btl, size_t nprocs, opal return OPAL_SUCCESS; } +static int mca_btl_ofi_register_error(mca_btl_base_module_t *btl, + mca_btl_base_module_error_cb_fn_t cb) +{ + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + ofi_btl->ofi_error_cb = cb; + return OPAL_SUCCESS; +} + void mca_btl_ofi_rcache_init(mca_btl_ofi_module_t *module) { if (!module->initialized) { @@ -519,4 +527,5 @@ mca_btl_ofi_module_t mca_btl_ofi_module_template = { .btl_add_procs = mca_btl_ofi_add_procs, .btl_del_procs = mca_btl_ofi_del_procs, .btl_finalize = mca_btl_ofi_finalize, + .btl_register_error = mca_btl_ofi_register_error, }}; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 8f633d6b48a..5257070d404 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -236,7 +236,7 @@ static int smcuda_register(void) if (0 == mca_btl_smcuda.super.btl_accelerator_eager_limit) { mca_btl_smcuda.super.btl_accelerator_eager_limit = SIZE_MAX; /* magic number */ } -#endif /* OPAL_CUDA_SUPPORT */ +#endif /* OPAL_CUDA_GDR_SUPPORT */ return mca_btl_smcuda_component_verify(); } diff --git a/opal/mca/btl/tcp/btl_tcp_component.c 
b/opal/mca/btl/tcp/btl_tcp_component.c index 9b1252e56d3..6ff668b6f8b 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -59,7 +59,7 @@ #ifdef HAVE_SYS_TIME_H # include #endif -#if HAVE_SYS_UCRED_H +#ifdef HAVE_SYS_UCRED_H # include #endif /* HAVE_SYS_UCRED_H */ #ifdef HAVE_UNISTD_H diff --git a/opal/mca/common/ucx/common_ucx_wpool.c b/opal/mca/common/ucx/common_ucx_wpool.c index ae290201710..cd93f253e4d 100644 --- a/opal/mca/common/ucx/common_ucx_wpool.c +++ b/opal/mca/common/ucx/common_ucx_wpool.c @@ -32,6 +32,7 @@ __thread int initialized = 0; #endif bool opal_common_ucx_thread_enabled = false; +bool opal_common_ucx_single_threaded = true; opal_atomic_int64_t opal_common_ucx_ep_counts = 0; opal_atomic_int64_t opal_common_ucx_unpacked_rkey_counts = 0; @@ -55,7 +56,7 @@ static opal_common_ucx_winfo_t *_winfo_create(opal_common_ucx_wpool_t *wpool) if (opal_common_ucx_thread_enabled || wpool->dflt_winfo == NULL) { memset(&worker_params, 0, sizeof(worker_params)); worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; - worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; + worker_params.thread_mode = opal_common_ucx_single_threaded ? 
UCS_THREAD_MODE_SINGLE : UCS_THREAD_MODE_SERIALIZED; status = ucp_worker_create(wpool->ucp_ctx, &worker_params, &worker); if (UCS_OK != status) { MCA_COMMON_UCX_ERROR("ucp_worker_create failed: %d", status); diff --git a/opal/mca/common/ucx/common_ucx_wpool.h b/opal/mca/common/ucx/common_ucx_wpool.h index 0d94e51cb64..44bf55803d0 100644 --- a/opal/mca/common/ucx/common_ucx_wpool.h +++ b/opal/mca/common/ucx/common_ucx_wpool.h @@ -59,6 +59,7 @@ typedef struct { } opal_common_ucx_wpool_t; extern bool opal_common_ucx_thread_enabled; +extern bool opal_common_ucx_single_threaded; extern opal_atomic_int64_t opal_common_ucx_ep_counts; extern opal_atomic_int64_t opal_common_ucx_unpacked_rkey_counts; diff --git a/opal/mca/if/bsdx_ipv4/if_bsdx.c b/opal/mca/if/bsdx_ipv4/if_bsdx.c index 87bc27b8d42..3ed71ee5caf 100644 --- a/opal/mca/if/bsdx_ipv4/if_bsdx.c +++ b/opal/mca/if/bsdx_ipv4/if_bsdx.c @@ -39,7 +39,7 @@ opal_if_base_component_t mca_if_bsdx_ipv4_component = { {/* This component is checkpointable */ MCA_BASE_METADATA_PARAM_CHECKPOINT}, }; -MCA_BASE_COMPONENT_INIT(opal, if, bsdx_ipv4_component) +MCA_BASE_COMPONENT_INIT(opal, if, bsdx_ipv4) /* convert a netmask (in network byte order) to CIDR notation */ static int prefix(uint32_t netmask) diff --git a/opal/mca/threads/base/mutex.c b/opal/mca/threads/base/mutex.c index fec6cee1d98..322fc855eb8 100644 --- a/opal/mca/threads/base/mutex.c +++ b/opal/mca/threads/base/mutex.c @@ -35,6 +35,12 @@ */ bool opal_uses_threads = false; +/* + * Track if MPI is running in single-threaded mode (MPI_THREAD_SINGLE). + * Default is true until MPI_Init/MPI_Init_thread determines otherwise. 
+ */ +bool opal_single_threaded = true; + static void mca_threads_mutex_constructor(opal_mutex_t *p_mutex) { #if OPAL_ENABLE_DEBUG diff --git a/opal/mca/threads/thread_usage.h b/opal/mca/threads/thread_usage.h index 4e2fd75a7e1..66b00eba4e5 100644 --- a/opal/mca/threads/thread_usage.h +++ b/opal/mca/threads/thread_usage.h @@ -33,6 +33,7 @@ #include "opal/sys/atomic.h" OPAL_DECLSPEC extern bool opal_uses_threads; +OPAL_DECLSPEC extern bool opal_single_threaded; /** * Check and see if the process is using multiple threads.